@@ -11,13 +11,94 @@ def main():
1111 parser .add_argument ('--db' , required = True , help = 'Path to index.db' )
1212 parser .add_argument ('--source' , required = True , help = 'Source directory' )
1313 parser .add_argument ('--limit' , type = int , default = 20 , help = 'Number of samples' )
14+ parser .add_argument ('--deep' , action = 'store_true' , help = 'Deep analysis of all files' )
1415
1516 args = parser .parse_args ()
1617
1718 conn = sqlite3 .connect (args .db )
1819 conn .row_factory = sqlite3 .Row
1920 cur = conn .cursor ()
2021
22+ if args .deep :
23+ # Deep analysis - check ALL files and categorize failures
24+ print (f"\n { '=' * 70 } " )
25+ print (f" Deep ContentID Analysis - Checking ALL files" )
26+ print (f"{ '=' * 70 } \n " )
27+
28+ cur .execute ("SELECT COUNT(*) FROM files WHERE contentID IS NOT NULL AND contentID != ''" )
29+ total = cur .fetchone ()[0 ]
30+ print (f"Total files with contentID: { total :,} " )
31+
32+ cur .execute ("""
33+ SELECT id, name, parentID, contentID
34+ FROM files
35+ WHERE contentID IS NOT NULL AND contentID != ''
36+ """ )
37+
38+ found = 0
39+ not_found = 0
40+ is_directory = 0
41+ empty_contentid = 0
42+ sample_not_found = []
43+ sample_is_dir = []
44+
45+ for row in cur :
46+ content_id = row ['contentID' ]
47+
48+ if not content_id or content_id .strip () == '' :
49+ empty_contentid += 1
50+ continue
51+
52+ # Try to find the file
53+ candidates = [
54+ os .path .join (args .source , content_id [0 ], content_id ),
55+ os .path .join (args .source , content_id ),
56+ ]
57+
58+ found_path = None
59+ for c in candidates :
60+ if os .path .exists (c ):
61+ found_path = c
62+ break
63+
64+ if found_path :
65+ if os .path .isdir (found_path ):
66+ is_directory += 1
67+ if len (sample_is_dir ) < 5 :
68+ sample_is_dir .append ((row ['name' ], content_id , found_path ))
69+ else :
70+ found += 1
71+ else :
72+ not_found += 1
73+ if len (sample_not_found ) < 10 :
74+ sample_not_found .append ((row ['name' ], content_id , candidates [0 ]))
75+
76+ print (f"\n Results:" )
77+ print (f" ✅ Found (files): { found :,} ({ 100 * found / total :.1f} %)" )
78+ print (f" 📁 Found (dirs): { is_directory :,} ({ 100 * is_directory / total :.1f} %)" )
79+ print (f" ❌ Not found: { not_found :,} ({ 100 * not_found / total :.1f} %)" )
80+ print (f" ⚠️ Empty contentID: { empty_contentid :,} " )
81+
82+ if sample_not_found :
83+ print (f"\n Sample NOT FOUND files:" )
84+ print ("-" * 70 )
85+ for name , cid , path in sample_not_found :
86+ print (f" Name: { name } " )
87+ print (f" contentID: { cid } " )
88+ print (f" Expected: { path } " )
89+ print ()
90+
91+ if sample_is_dir :
92+ print (f"\n Sample DIRECTORY entries (contentID points to dir):" )
93+ print ("-" * 70 )
94+ for name , cid , path in sample_is_dir :
95+ print (f" Name: { name } " )
96+ print (f" contentID: { cid } " )
97+ print (f" Path: { path } " )
98+ print ()
99+
100+ return
101+
21102 # Get sample contentIDs
22103 cur .execute ("""
23104 SELECT id, name, parentID, contentID
0 commit comments