@@ -44,6 +44,15 @@ def missing_module(module_name: str) -> None:
4444build_dir = pathlib .Path (gitroot , "mad-generation-build" )
4545
4646
def database_dir_for_project(name: str) -> pathlib.Path:
    """
    Return the canonical location of the CodeQL database for a project.

    Args:
        name: The name of the project.

    Returns:
        The path `<build_dir>/<name>-db`.
    """
    database_dir_name = f"{name}-db"
    return build_dir.joinpath(database_dir_name)
49+
50+
def database_for_project_exists(name: str) -> bool:
    """
    Check whether a database already exists for a project.

    Args:
        name: The name of the project.

    Returns:
        True if the project's canonical database directory exists on disk,
        False otherwise.
    """
    return database_dir_for_project(name).exists()
54+
55+
4756# A project to generate models for
4857Project = TypedDict (
4958 "Project" ,
@@ -175,7 +184,7 @@ def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
175184
176185def build_database (
177186 language : str , extractor_options , project : Project , project_dir : str
178- ) -> str | None :
187+ ) -> bool :
179188 """
180189 Build a CodeQL database for a project.
181190
@@ -186,12 +195,12 @@ def build_database(
186195 project_dir: Path to the CodeQL database.
187196
188197 Returns:
189- The path to the created database directory .
198+ True if the build was successful, False otherwise .
190199 """
191200 name = project ["name" ]
192201
193202 # Create database directory path
194- database_dir = build_dir / f" { name } -db"
203+ database_dir = database_dir_for_project ( name )
195204
196205 # Only build the database if it doesn't already exist
197206 if not database_dir .exists ():
@@ -214,13 +223,13 @@ def build_database(
214223 print (f"Successfully created database at { database_dir } " )
215224 except subprocess .CalledProcessError as e :
216225 print (f"Failed to create database for { name } : { e } " )
217- return None
226+ return False
218227 else :
219228 print (
220229 f"Skipping database creation for { name } as it already exists at { database_dir } "
221230 )
222231
223- return database_dir
232+ return True
224233
225234
226235def generate_models (config , args , project : Project , database_dir : str ) -> None :
@@ -251,7 +260,7 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:
251260
252261def build_databases_from_projects (
253262 language : str , extractor_options , projects : List [Project ]
254- ) -> List [tuple [Project , str | None ]]:
263+ ) -> List [tuple [Project , bool ]]:
255264 """
256265 Build databases for all projects in parallel.
257266
@@ -261,7 +270,7 @@ def build_databases_from_projects(
261270 projects: List of projects to build databases for.
262271
263272 Returns:
264- List of (project_name, database_dir ) pairs, where database_dir is None if the build failed.
273+ List of (project_name, success ) pairs, where success is False if the build failed.
265274 """
266275 # Clone projects in parallel
267276 print ("=== Cloning projects ===" )
@@ -332,20 +341,22 @@ def download_dca_databases(
332341 language : str ,
333342 experiment_names : list [str ],
334343 pat : str ,
344+ reuse_databases : bool ,
335345 projects : List [Project ],
336- ) -> List [tuple [Project , str | None ]]:
346+ ) -> List [tuple [Project , bool ]]:
337347 """
338348 Download databases from a DCA experiment.
339349 Args:
340350 experiment_names: The names of the DCA experiments to download databases from.
341351 pat: Personal Access Token for GitHub API authentication.
342352 projects: List of projects to download databases for.
343353 Returns:
344- List of (project_name, database_dir ) pairs, where database_dir is None if the download failed.
354+ List of (project_name, success ) pairs, where success is False if the download failed.
345355 """
346356 print ("\n === Finding projects ===" )
347357 project_map = {project ["name" ]: project for project in projects }
348- analyzed_databases = {n : None for n in project_map }
358+
359+ analyzed_databases = {}
349360 for experiment_name in experiment_names :
350361 response = get_json_from_github (
351362 f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{ experiment_name } /reports/downloads.json" ,
@@ -358,11 +369,11 @@ def download_dca_databases(
358369 artifact_name = analyzed_database ["artifact_name" ]
359370 pretty_name = pretty_name_from_artifact_name (artifact_name )
360371
361- if not pretty_name in analyzed_databases :
372+ if not pretty_name in project_map :
362373 print (f"Skipping { pretty_name } as it is not in the list of projects" )
363374 continue
364375
365- if analyzed_databases [ pretty_name ] is not None :
376+ if pretty_name in analyzed_databases :
366377 print (
367378 f"Skipping previous database { analyzed_databases [pretty_name ]['artifact_name' ]} for { pretty_name } "
368379 )
@@ -376,8 +387,9 @@ def download_dca_databases(
376387 )
377388 sys .exit (1 )
378389
379- def download_and_decompress (analyzed_database : dict ) -> str :
390+ def download_and_decompress (analyzed_database : dict ) -> bool :
380391 artifact_name = analyzed_database ["artifact_name" ]
392+ pretty_name = pretty_name_from_artifact_name (artifact_name )
381393 repository = analyzed_database ["repository" ]
382394 run_id = analyzed_database ["run_id" ]
383395 print (f"=== Finding artifact: { artifact_name } ===" )
@@ -407,15 +419,18 @@ def download_and_decompress(analyzed_database: dict) -> str:
407419 with tarfile .open (artifact_tar_location , "r:gz" ) as tar_ref :
408420 # And we just untar it to the same directory as the zip file
409421 tar_ref .extractall (artifact_unzipped_location )
410- ret = artifact_unzipped_location / language
411- print (f"Decompression complete: { ret } " )
412- return ret
422+ database_location = database_dir_for_project (pretty_name )
423+ # Move the database to the canonical location
424+ shutil .move (artifact_unzipped_location / language , database_location )
425+
426+ print (f"Decompression complete: { database_location } " )
427+ return True
413428
414429 results = run_in_parallel (
415430 download_and_decompress ,
416431 list (analyzed_databases .values ()),
417432 on_error = lambda db , exc : print (
418- f"ERROR: Failed to download and decompress { db [" artifact_name" ]} : { exc } "
433+ f"ERROR: Failed to download and decompress { db [' artifact_name' ]} : { exc } "
419434 ),
420435 error_summary = lambda failures : print (
421436 f"ERROR: Failed to download { len (failures )} databases: { ', ' .join (item [0 ] for item in failures )} "
@@ -460,6 +475,13 @@ def main(config, args) -> None:
460475 # Create build directory if it doesn't exist
461476 build_dir .mkdir (parents = True , exist_ok = True )
462477
478+ # Check if reusing databases is given and all databases exist
479+ reuse_databases = args .reuse_databases
480+ all_databases_exist = reuse_databases and
481+ all_exist = all (
482+ database_for_project_exists (project ["name" ]) for project in projects
483+ )
484+
463485 database_results = []
464486 match get_strategy (config ):
465487 case "repo" :
@@ -487,14 +509,15 @@ def main(config, args) -> None:
487509 language ,
488510 experiment_names ,
489511 pat ,
512+ args .reuse_databases ,
490513 projects ,
491514 )
492515
493516 # Generate models for all projects
494517 print ("\n === Generating models ===" )
495518
496519 failed_builds = [
497- project ["name" ] for project , db_dir in database_results if db_dir is None
520+ project ["name" ] for project , success in database_results if not success
498521 ]
499522 if failed_builds :
500523 print (
@@ -506,8 +529,9 @@ def main(config, args) -> None:
506529 for project , _ in database_results :
507530 clean_up_mad_destination_for_project (config , project ["name" ])
508531
509- for project , database_dir in database_results :
510- if database_dir is not None :
532+ for project , success in database_results :
533+ database_dir = database_dir_for_project (project ["name" ])
534+ if success :
511535 generate_models (config , args , project , database_dir )
512536
513537
@@ -543,6 +567,12 @@ def main(config, args) -> None:
543567 help = "What `--threads` value to pass to `codeql` (default %(default)s)" ,
544568 default = 0 ,
545569 )
570+ parser .add_argument (
571+ "--reuse-databases" ,
572+ type = bool ,
573+ help = "Whether to reuse existing databases instead of rebuilding them" ,
574+ default = False ,
575+ )
546576 args = parser .parse_args ()
547577
548578 # Load config file
0 commit comments