For more reliable duplicate detection that works across different filenames and folder structures, compare file contents rather than just names, sizes, or dimensions. Here's an improved script that uses content hashing for accurate duplicate detection:
```python
import os
import shutil
import hashlib
from datetime import datetime
def get_file_hash(filepath, chunk_size=8192):
    """Generate MD5 hash of file content (more reliable than size/dimensions)"""
    md5 = hashlib.md5()
    try:
        with open(filepath, 'rb') as f:
            while chunk := f.read(chunk_size):
                md5.update(chunk)
        return md5.hexdigest()
    except Exception as e:
        print(f" [!] Error hashing {filepath}: {e}")
        return None
def find_and_move_duplicates(folder1, folder2):
    """Find duplicates by content hash and move them from folder1 to a duplicates folder"""
    # Create duplicates folder with timestamp
    duplicates_folder = os.path.join(folder1, f"duplicates_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    os.makedirs(duplicates_folder, exist_ok=True)

    print(f"\n{'='*60}")
    print("ADVANCED DUPLICATE CLEANER")
    print(f"Source Folder: {folder1}")
    print(f"Reference Folder: {folder2}")
    print(f"Duplicates will be moved to: {duplicates_folder}")
    print(f"{'='*60}\n")

    # Phase 1: Build hash index of reference folder (folder2)
    print("[PHASE 1] Building content hash index of reference folder...")
    ref_hashes = set()
    total_ref_files = 0
    for root, _, files in os.walk(folder2):
        for file in files:
            filepath = os.path.join(root, file)
            file_hash = get_file_hash(filepath)
            if file_hash:
                ref_hashes.add(file_hash)
            total_ref_files += 1
            print(f" Indexed: {total_ref_files} files | Current: {file[:50]}...", end='\r')

    print(f"\n Finished indexing {total_ref_files} files in reference folder")
    print(f" Unique content hashes: {len(ref_hashes)}")

    # Phase 2: Scan and clean folder1
    print("\n[PHASE 2] Finding and moving duplicates from source folder...")
    moved_files = []
    skipped_files = []
    total_checked = 0
    for root, _, files in os.walk(folder1):
        # Skip anything already inside the duplicates folder
        if root.startswith(duplicates_folder):
            continue
        for file in files:
            filepath = os.path.join(root, file)
            rel_path = os.path.relpath(filepath, folder1)
            total_checked += 1
            print(f" Checking {total_checked} files | Current: {file[:50]}...", end='\r')

            file_hash = get_file_hash(filepath)
            if not file_hash:
                skipped_files.append(filepath)
                continue

            if file_hash in ref_hashes:
                # Prepare destination path, preserving the relative folder structure
                dest_path = os.path.join(duplicates_folder, rel_path)
                os.makedirs(os.path.dirname(dest_path), exist_ok=True)

                # Handle naming conflicts without stacking suffixes
                base, ext = os.path.splitext(dest_path)
                counter = 1
                while os.path.exists(dest_path):
                    dest_path = f"{base}_{counter}{ext}"
                    counter += 1

                try:
                    shutil.move(filepath, dest_path)
                    moved_files.append((filepath, dest_path))
                    print(f" MOVED: {rel_path} (duplicate found)")
                except Exception as e:
                    print(f" [!] Failed to move {rel_path}: {e}")
                    skipped_files.append(filepath)
    # Generate report
    report = [
        "Advanced Duplicate Files Report",
        "="*80,
        f"Source Folder: {folder1}",
        f"Reference Folder: {folder2}",
        f"Duplicates Folder: {duplicates_folder}",
        f"Scan Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "="*80,
        f"\nFiles checked: {total_checked}",
        f"Files moved: {len(moved_files)}",
        f"Files skipped: {len(skipped_files)}",
        f"Reference files indexed: {total_ref_files}",
        f"Unique content hashes: {len(ref_hashes)}",
        "\nMOVED FILES:"
    ]
    for src, dest in moved_files:
        report.append(f"{os.path.relpath(src, folder1)} → {os.path.relpath(dest, folder1)}")

    if skipped_files:
        report.append("\nSKIPPED FILES:")
        report.extend(skipped_files[:100])  # Limit to first 100 skipped files

    report_path = os.path.join(folder1, "duplicates_report.txt")
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(report))

    print(f"\n{'='*60}")
    print("OPERATION COMPLETE")
    print(f"Files checked: {total_checked}")
    print(f"Duplicates moved: {len(moved_files)}")
    print(f"Report saved to: {report_path}")
    print(f"{'='*60}")
def main():
    print("ADVANCED DUPLICATE FILE CLEANER")
    print("This version uses file content hashing for reliable duplicate detection")
    print("Works across different filenames and folder structures\n")

    # Get folder paths
    folder = input("Enter path to either folder (with or without '1'): ").strip()
    folder = folder.rstrip(os.sep)

    # Auto-detect which folder is which
    if folder.endswith('1'):
        folder1 = folder
        folder2 = folder[:-1]
    else:
        folder1 = folder + '1'
        folder2 = folder

    # Validate folders
    if not os.path.isdir(folder1):
        print(f"Error: Folder not found - {folder1}")
        return
    if not os.path.isdir(folder2):
        print(f"Error: Folder not found - {folder2}")
        return

    print("\nFolder configuration:")
    print(f"Folder to clean (ending with '1'): {folder1}")
    print(f"Reference folder: {folder2}")

    confirm = input("\nContinue? This may take time for large folders. (y/n): ").lower()
    if confirm != 'y':
        print("Operation cancelled.")
        return

    find_and_move_duplicates(folder1, folder2)


if __name__ == "__main__":
    main()
```
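The script pairs folders by a simple naming convention: the folder to clean has the same path as the reference folder plus a trailing `1`. As a quick illustration of how `main()` resolves whichever path you enter (the `D:\Photos` paths below are placeholders, not required names):

```python
# Folder auto-detection rule used by main(); the paths are hypothetical examples.
import os

folder = r"D:\Photos".rstrip(os.sep)
if folder.endswith('1'):
    folder1, folder2 = folder, folder[:-1]      # you entered the "...1" folder
else:
    folder1, folder2 = folder + '1', folder     # you entered the reference folder

print(folder1)  # D:\Photos1 -> folder that gets cleaned
print(folder2)  # D:\Photos  -> reference folder (never modified)
```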
### Key Improvements:
1. **Content-Based Detection**:
   - Uses MD5 hashing of file contents, so matches are based on the actual data (a short demonstration follows this list)
   - Works regardless of filenames or folder structures
   - Detects identical files even if they have been renamed
2. **Progress Tracking**:
   - Shows real-time progress counters
   - Displays the file currently being processed
   - Provides ongoing status updates
3. **Better Handling**:
   - Skips files that can't be read
   - Preserves the folder structure inside the duplicates folder
   - Handles naming conflicts automatically
4. **Comprehensive Reporting**:
   - Detailed statistics
   - Complete list of moved files
   - Error tracking
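To make the first point concrete, here is a minimal sketch, assuming the script's `get_file_hash` is in scope (e.g. the code above is saved as a module). The filenames and byte strings are made up for the demonstration:

```python
import os
import tempfile

# Hypothetical demo files: two identical copies under different names, one different file
tmp = tempfile.mkdtemp()
files = {
    "vacation_2019.jpg": b"\x89PNG fake image bytes",
    "IMG_4821 (copy).jpg": b"\x89PNG fake image bytes",   # same content, different name
    "vacation_2020.jpg": b"different image bytes",
}
for name, data in files.items():
    with open(os.path.join(tmp, name), "wb") as f:
        f.write(data)

hashes = {name: get_file_hash(os.path.join(tmp, name)) for name in files}
assert hashes["vacation_2019.jpg"] == hashes["IMG_4821 (copy).jpg"]   # duplicate found despite rename
assert hashes["vacation_2019.jpg"] != hashes["vacation_2020.jpg"]     # different content, different hash
```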
### How It Works:
1. **Indexing Phase**:
   - Scans the reference folder (the one without "1")
   - Generates a content hash for each file
   - Stores all unique hashes in a set
2. **Comparison Phase**:
   - Scans the folder ending with "1"
   - Generates a hash for each file
   - Checks it against the set of reference hashes
   - Moves matches to the duplicates folder (the sketch after this list distills this two-phase flow)
3. **Optimizations**:
   - Set membership checks on hashes are constant-time, so the comparison phase scales with the number of files
   - Processes files in chunks (memory efficient, even for very large files)
   - Skips files that are already in the duplicates folder
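Stripped of progress output, reporting, and conflict handling, the two phases reduce to roughly the sketch below. It reuses the script's `get_file_hash`; the function name `move_content_duplicates` is just illustrative, not part of the script above:

```python
import os
import shutil

def move_content_duplicates(folder_to_clean, reference_folder, duplicates_folder):
    """Condensed two-phase core: index reference hashes, then move matching files."""
    # Phase 1: hash every file in the reference folder
    ref_hashes = {
        get_file_hash(os.path.join(root, name))
        for root, _, names in os.walk(reference_folder)
        for name in names
    }
    ref_hashes.discard(None)  # drop entries for unreadable files

    # Phase 2: move any file in the cleanup folder whose content hash is already known
    os.makedirs(duplicates_folder, exist_ok=True)
    for root, _, names in os.walk(folder_to_clean):
        if root.startswith(duplicates_folder):
            continue
        for name in names:
            path = os.path.join(root, name)
            if get_file_hash(path) in ref_hashes:
                shutil.move(path, os.path.join(duplicates_folder, name))
```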
Because it compares actual file contents rather than metadata, this method finds duplicates regardless of their names or locations in the folder structure. An accidental MD5 collision between two genuinely different files is astronomically unlikely, but if you want absolute certainty you can confirm each match byte-for-byte before moving anything, as sketched below.
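One way to add that confirmation (a sketch, not part of the script above; both helper names are hypothetical) is to map each reference hash to one file that produced it, then verify with the standard library's `filecmp.cmp` before moving:

```python
import filecmp
import os

def build_reference_index(reference_folder):
    """Map each content hash to one reference file that produced it (hypothetical helper)."""
    index = {}
    for root, _, names in os.walk(reference_folder):
        for name in names:
            path = os.path.join(root, name)
            file_hash = get_file_hash(path)
            if file_hash and file_hash not in index:
                index[file_hash] = path
    return index

def is_confirmed_duplicate(path, ref_index):
    """Hash match first, then a byte-for-byte comparison to rule out collisions."""
    ref_path = ref_index.get(get_file_hash(path))
    return ref_path is not None and filecmp.cmp(path, ref_path, shallow=False)
```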