Skip to content

Commit 08b846c

Browse files
committed
Add --deep flag to debug_contentid.py for comprehensive analysis
1 parent fb23d99 commit 08b846c

1 file changed

Lines changed: 81 additions & 0 deletions

File tree

scripts/debug_contentid.py

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,13 +11,94 @@ def main():
1111
parser.add_argument('--db', required=True, help='Path to index.db')
1212
parser.add_argument('--source', required=True, help='Source directory')
1313
parser.add_argument('--limit', type=int, default=20, help='Number of samples')
14+
parser.add_argument('--deep', action='store_true', help='Deep analysis of all files')
1415

1516
args = parser.parse_args()
1617

1718
conn = sqlite3.connect(args.db)
1819
conn.row_factory = sqlite3.Row
1920
cur = conn.cursor()
2021

22+
if args.deep:
23+
# Deep analysis - check ALL files and categorize failures
24+
print(f"\n{'='*70}")
25+
print(f" Deep ContentID Analysis - Checking ALL files")
26+
print(f"{'='*70}\n")
27+
28+
cur.execute("SELECT COUNT(*) FROM files WHERE contentID IS NOT NULL AND contentID != ''")
29+
total = cur.fetchone()[0]
30+
print(f"Total files with contentID: {total:,}")
31+
32+
cur.execute("""
33+
SELECT id, name, parentID, contentID
34+
FROM files
35+
WHERE contentID IS NOT NULL AND contentID != ''
36+
""")
37+
38+
found = 0
39+
not_found = 0
40+
is_directory = 0
41+
empty_contentid = 0
42+
sample_not_found = []
43+
sample_is_dir = []
44+
45+
for row in cur:
46+
content_id = row['contentID']
47+
48+
if not content_id or content_id.strip() == '':
49+
empty_contentid += 1
50+
continue
51+
52+
# Try to find the file
53+
candidates = [
54+
os.path.join(args.source, content_id[0], content_id),
55+
os.path.join(args.source, content_id),
56+
]
57+
58+
found_path = None
59+
for c in candidates:
60+
if os.path.exists(c):
61+
found_path = c
62+
break
63+
64+
if found_path:
65+
if os.path.isdir(found_path):
66+
is_directory += 1
67+
if len(sample_is_dir) < 5:
68+
sample_is_dir.append((row['name'], content_id, found_path))
69+
else:
70+
found += 1
71+
else:
72+
not_found += 1
73+
if len(sample_not_found) < 10:
74+
sample_not_found.append((row['name'], content_id, candidates[0]))
75+
76+
print(f"\nResults:")
77+
print(f" ✅ Found (files): {found:,} ({100*found/total:.1f}%)")
78+
print(f" 📁 Found (dirs): {is_directory:,} ({100*is_directory/total:.1f}%)")
79+
print(f" ❌ Not found: {not_found:,} ({100*not_found/total:.1f}%)")
80+
print(f" ⚠️ Empty contentID: {empty_contentid:,}")
81+
82+
if sample_not_found:
83+
print(f"\nSample NOT FOUND files:")
84+
print("-" * 70)
85+
for name, cid, path in sample_not_found:
86+
print(f" Name: {name}")
87+
print(f" contentID: {cid}")
88+
print(f" Expected: {path}")
89+
print()
90+
91+
if sample_is_dir:
92+
print(f"\nSample DIRECTORY entries (contentID points to dir):")
93+
print("-" * 70)
94+
for name, cid, path in sample_is_dir:
95+
print(f" Name: {name}")
96+
print(f" contentID: {cid}")
97+
print(f" Path: {path}")
98+
print()
99+
100+
return
101+
21102
# Get sample contentIDs
22103
cur.execute("""
23104
SELECT id, name, parentID, contentID

0 commit comments

Comments
 (0)