diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/README.md b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/README.md new file mode 100644 index 0000000..dcce457 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/adapter_config.json new file mode 100644 index 0000000..6624204 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/adapter_config.json @@ -0,0 +1,34 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "beta1": 0.85, + "beta2": 0.85, + "bias": "none", + "deltaT": 1, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "init_r": 12, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "orth_reg_weight": 0.5, + "peft_type": "ADALORA", + "r": 8, + "rank_pattern": null, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "target_r": 8, + "task_type": null, + "tfinal": 0, + "tinit": 0, + "total_step": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/adapter_model.bin new file mode 100644 index 0000000..eda14db Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go-256/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/README.md 
b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/README.md new file mode 100644 index 0000000..65c9e3b --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/README.md @@ -0,0 +1,10 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/adapter_config.json new file mode 100644 index 0000000..6624204 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/adapter_config.json @@ -0,0 +1,34 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "beta1": 0.85, + "beta2": 0.85, + "bias": "none", + "deltaT": 1, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "init_r": 12, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "orth_reg_weight": 0.5, + "peft_type": "ADALORA", + "r": 8, + "rank_pattern": null, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "target_r": 8, + "task_type": null, + "tfinal": 0, + "tinit": 0, + "total_step": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/adapter_model.bin new file mode 100644 index 0000000..b677222 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-go/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/README.md b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/README.md new file mode 100644 index 0000000..75ccb89 --- 
/dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/README.md @@ -0,0 +1,12 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/adapter_config.json new file mode 100644 index 0000000..6624204 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/adapter_config.json @@ -0,0 +1,34 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "beta1": 0.85, + "beta2": 0.85, + "bias": "none", + "deltaT": 1, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "init_r": 12, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "orth_reg_weight": 0.5, + "peft_type": "ADALORA", + "r": 8, + "rank_pattern": null, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "target_r": 8, + "task_type": null, + "tfinal": 0, + "tinit": 0, + "total_step": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/adapter_model.bin new file mode 100644 index 0000000..0de731e Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-java-256/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/README.md b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/README.md new file mode 100644 index 0000000..65c9e3b --- /dev/null +++ 
b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/README.md @@ -0,0 +1,10 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/adapter_config.json new file mode 100644 index 0000000..6624204 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/adapter_config.json @@ -0,0 +1,34 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "beta1": 0.85, + "beta2": 0.85, + "bias": "none", + "deltaT": 1, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "init_r": 12, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "orth_reg_weight": 0.5, + "peft_type": "ADALORA", + "r": 8, + "rank_pattern": null, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "target_r": 8, + "task_type": null, + "tfinal": 0, + "tinit": 0, + "total_step": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/adapter_model.bin new file mode 100644 index 0000000..f3befa4 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-javascript/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/README.md b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/README.md new file mode 100644 index 0000000..4f7c443 --- /dev/null +++ 
b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/README.md @@ -0,0 +1,11 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/adapter_config.json new file mode 100644 index 0000000..6624204 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/adapter_config.json @@ -0,0 +1,34 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "beta1": 0.85, + "beta2": 0.85, + "bias": "none", + "deltaT": 1, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "init_r": 12, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "orth_reg_weight": 0.5, + "peft_type": "ADALORA", + "r": 8, + "rank_pattern": null, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "target_r": 8, + "task_type": null, + "tfinal": 0, + "tinit": 0, + "total_step": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/adapter_model.bin new file mode 100644 index 0000000..0ce8c11 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-php-256/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/README.md b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/README.md new file mode 100644 index 0000000..dcce457 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/README.md @@ -0,0 
+1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/adapter_config.json new file mode 100644 index 0000000..6624204 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/adapter_config.json @@ -0,0 +1,34 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "beta1": 0.85, + "beta2": 0.85, + "bias": "none", + "deltaT": 1, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "init_r": 12, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "orth_reg_weight": 0.5, + "peft_type": "ADALORA", + "r": 8, + "rank_pattern": null, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "target_r": 8, + "task_type": null, + "tfinal": 0, + "tinit": 0, + "total_step": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/adapter_model.bin new file mode 100644 index 0000000..db167b6 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/adalora-csn-ruby/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/README.md b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/README.md new file mode 100644 index 0000000..3a7f0b6 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/README.md @@ -0,0 +1,13 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 
+- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/adapter_config.json new file mode 100644 index 0000000..0351d12 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/adapter_config.json @@ -0,0 +1,22 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "fan_in_fan_out": false, + "feedforward_modules": [ + "o" + ], + "inference_mode": true, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": [ + "q", + "v", + "o" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/adapter_model.bin new file mode 100644 index 0000000..db2fe42 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-go-256/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/README.md b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/README.md new file mode 100644 index 0000000..3a7f0b6 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/README.md @@ -0,0 +1,13 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/adapter_config.json new file mode 100644 index 0000000..0351d12 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/adapter_config.json @@ -0,0 +1,22 @@ +{ + "auto_mapping": 
{ + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "fan_in_fan_out": false, + "feedforward_modules": [ + "o" + ], + "inference_mode": true, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": [ + "q", + "v", + "o" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/adapter_model.bin new file mode 100644 index 0000000..40c3846 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-java/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/README.md b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/README.md new file mode 100644 index 0000000..4f7c443 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/README.md @@ -0,0 +1,11 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/adapter_config.json new file mode 100644 index 0000000..0351d12 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/adapter_config.json @@ -0,0 +1,22 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "fan_in_fan_out": false, + "feedforward_modules": [ + "o" + ], + "inference_mode": true, + 
"init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": [ + "q", + "v", + "o" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/adapter_model.bin new file mode 100644 index 0000000..f282f93 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-javascript/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/README.md b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/README.md new file mode 100644 index 0000000..3a7f0b6 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/README.md @@ -0,0 +1,13 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/adapter_config.json new file mode 100644 index 0000000..0351d12 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/adapter_config.json @@ -0,0 +1,22 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "fan_in_fan_out": false, + "feedforward_modules": [ + "o" + ], + "inference_mode": true, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": [ + "q", + "v", + "o" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/adapter_model.bin 
b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/adapter_model.bin new file mode 100644 index 0000000..bed90ac Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-php-256/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/README.md b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/README.md new file mode 100644 index 0000000..75ccb89 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/README.md @@ -0,0 +1,12 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/adapter_config.json new file mode 100644 index 0000000..0351d12 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/adapter_config.json @@ -0,0 +1,22 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "fan_in_fan_out": false, + "feedforward_modules": [ + "o" + ], + "inference_mode": true, + "init_ia3_weights": true, + "modules_to_save": null, + "peft_type": "IA3", + "revision": null, + "target_modules": [ + "q", + "v", + "o" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/adapter_model.bin new file mode 100644 index 0000000..afc4842 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/ia3-csn-ruby/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/README.md b/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/README.md new file mode 100644 
index 0000000..65c9e3b --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/README.md @@ -0,0 +1,10 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/adapter_config.json new file mode 100644 index 0000000..72e415f --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/adapter_config.json @@ -0,0 +1,24 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/adapter_model.bin new file mode 100644 index 0000000..a8d1c5e Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/lora-csn-go/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/README.md b/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/README.md new file mode 100644 index 0000000..dcce457 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/adapter_config.json 
b/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/adapter_config.json new file mode 100644 index 0000000..72e415f --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/adapter_config.json @@ -0,0 +1,24 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/adapter_model.bin new file mode 100644 index 0000000..7476a84 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/lora-csn-java/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/README.md b/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/README.md new file mode 100644 index 0000000..65c9e3b --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/README.md @@ -0,0 +1,10 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/adapter_config.json new file mode 100644 index 0000000..72e415f --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/adapter_config.json @@ -0,0 +1,24 @@ +{ + "auto_mapping": { + "base_model_class": 
"CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/adapter_model.bin new file mode 100644 index 0000000..9709596 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/lora-csn-javascript/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/README.md b/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/README.md new file mode 100644 index 0000000..dcce457 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/adapter_config.json new file mode 100644 index 0000000..72e415f --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/adapter_config.json @@ -0,0 +1,24 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + 
"init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "task_type": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/adapter_model.bin new file mode 100644 index 0000000..12d0759 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/lora-csn-ruby/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/lora_php_256/README.md b/checkpoints/codet5p-110m-embedding-csn/lora_php_256/README.md new file mode 100644 index 0000000..dcce457 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora_php_256/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/lora_php_256/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/lora_php_256/adapter_config.json new file mode 100644 index 0000000..72e415f --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/lora_php_256/adapter_config.json @@ -0,0 +1,24 @@ +{ + "auto_mapping": { + "base_model_class": "CodeT5pEmbeddingModel", + "parent_library": "transformers_modules.Salesforce.codet5p-110m-embedding.94f88f95672b1d4b0cc715c6011001a74f892bdd.modeling_codet5p_embedding" + }, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q", + "v" + ], + "task_type": null +} \ No newline at end of file diff --git 
a/checkpoints/codet5p-110m-embedding-csn/lora_php_256/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/lora_php_256/adapter_model.bin new file mode 100644 index 0000000..d471823 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/lora_php_256/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/README.md b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/README.md new file mode 100644 index 0000000..3a7f0b6 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/README.md @@ -0,0 +1,13 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/adapter_config.json new file mode 100644 index 0000000..310f581 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/adapter_config.json @@ -0,0 +1,16 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "inference_mode": true, + "num_attention_heads": 12, + "num_layers": 12, + "num_transformer_submodules": 1, + "num_virtual_tokens": 10, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "FEATURE_EXTRACTION", + "token_dim": 768, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/adapter_model.bin new file mode 100644 index 0000000..4a09643 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-go-256/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/README.md 
b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/README.md new file mode 100644 index 0000000..4f7c443 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/README.md @@ -0,0 +1,11 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/adapter_config.json new file mode 100644 index 0000000..310f581 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/adapter_config.json @@ -0,0 +1,16 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "inference_mode": true, + "num_attention_heads": 12, + "num_layers": 12, + "num_transformer_submodules": 1, + "num_virtual_tokens": 10, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "FEATURE_EXTRACTION", + "token_dim": 768, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/adapter_model.bin new file mode 100644 index 0000000..312fa70 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-java/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/README.md b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/README.md new file mode 100644 index 0000000..3a7f0b6 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/README.md @@ -0,0 +1,13 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git 
a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/adapter_config.json new file mode 100644 index 0000000..310f581 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/adapter_config.json @@ -0,0 +1,16 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "inference_mode": true, + "num_attention_heads": 12, + "num_layers": 12, + "num_transformer_submodules": 1, + "num_virtual_tokens": 10, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "FEATURE_EXTRACTION", + "token_dim": 768, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/adapter_model.bin new file mode 100644 index 0000000..56fbe42 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-javascript/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/README.md b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/README.md new file mode 100644 index 0000000..3a7f0b6 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/README.md @@ -0,0 +1,13 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/adapter_config.json new file mode 100644 index 0000000..310f581 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/adapter_config.json @@ -0,0 +1,16 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": 
"Salesforce/codet5p-110m-embedding", + "inference_mode": true, + "num_attention_heads": 12, + "num_layers": 12, + "num_transformer_submodules": 1, + "num_virtual_tokens": 10, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "FEATURE_EXTRACTION", + "token_dim": 768, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/adapter_model.bin b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/adapter_model.bin new file mode 100644 index 0000000..9804d06 Binary files /dev/null and b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-php-256/adapter_model.bin differ diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-ruby/README.md b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-ruby/README.md new file mode 100644 index 0000000..3a7f0b6 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-ruby/README.md @@ -0,0 +1,13 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 +- PEFT 0.5.0 + +- PEFT 0.5.0 diff --git a/checkpoints/codet5p-110m-embedding-csn/prompt-csn-ruby/adapter_config.json b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-ruby/adapter_config.json new file mode 100644 index 0000000..310f581 --- /dev/null +++ b/checkpoints/codet5p-110m-embedding-csn/prompt-csn-ruby/adapter_config.json @@ -0,0 +1,16 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "Salesforce/codet5p-110m-embedding", + "inference_mode": true, + "num_attention_heads": 12, + "num_layers": 12, + "num_transformer_submodules": 1, + "num_virtual_tokens": 10, + "peft_type": "PROMPT_TUNING", + "prompt_tuning_init": "RANDOM", + "prompt_tuning_init_text": null, + "revision": null, + "task_type": "FEATURE_EXTRACTION", + "token_dim": 768, + "tokenizer_name_or_path": null +} \ No newline at end of file diff --git 
MODES = ['train', 'validation', 'test']
LANGUAGES = ['python', 'go', 'java', 'javascript', 'ruby', 'php']
COLUMNS_RENAMING = {'func_code_string': 'code_tokens', 'func_documentation_string': 'summary'}


class CodeSearchNet:
    """Thin wrapper exposing one CodeSearchNet split as a two-column pandas frame.

    Loads the requested split from the HuggingFace hub, keeps only the raw
    code string and the docstring, and renames them to the column names the
    rest of the pipeline expects ('code_tokens', 'summary').
    """

    def __init__(self, mode, language):
        """Download and prepare the split.

        :param mode: One of MODES ('train', 'validation', 'test').
        :param language: One of LANGUAGES ('python', 'go', ...).
        """
        assert mode in MODES, f'Unsupported mode {mode}'
        assert language in LANGUAGES, f'Unsupported language {language}'

        self.language = language
        self.mode = mode

        frame = load_dataset("code_search_net", language)[mode].to_pandas()
        # Keep only the columns we rename; everything else is dropped in place.
        surplus_columns = [name for name in frame.columns if name not in COLUMNS_RENAMING]
        frame.drop(surplus_columns, axis=1, inplace=True)
        frame.rename(columns=COLUMNS_RENAMING, inplace=True)
        self.dataframe = frame

        logger.info(f'CodeSearchNet {language} {mode} dataset length: {len(self.dataframe)}')

    def get_pandas(self):
        """Return the prepared dataframe with 'code_tokens' and 'summary' columns."""
        return self.dataframe
def create_csn_dataset(language, **kwargs):
    """Assemble the CodeSearchNet train/val/test splits for one language.

    :param language: CodeSearchNet language id (e.g. 'go', 'java').
    :param kwargs: Ignored; accepted so the factory signature matches the
        other ``create_*_dataset`` entries used via DATASET_MAP.
    :return: datasets.DatasetDict with 'train', 'val' and 'test' splits.
    """
    # CSN_MODES is ('train', 'validation', 'test'); map each source mode
    # onto the split name the training code expects.
    split_for_mode = dict(zip(CSN_MODES, ('train', 'val', 'test')))

    dataset = datasets.DatasetDict()
    for mode in CSN_MODES:
        frame = CodeSearchNet(mode, language).get_pandas()
        dataset[split_for_mode[mode]] = datasets.Dataset.from_pandas(frame)
    return dataset
def evaluation_csn(model, test_loader: torch.utils.data.DataLoader, device, desc='Test set MRR = ', **kwargs):
    """Evaluate retrieval MRR on CodeSearchNet with its 1000-candidate protocol.

    Text and code embeddings are accumulated until CSN_BATCH_SIZE (1000)
    samples are buffered, a (1000, 1000) text-vs-code similarity matrix is
    scored with MRR, and the buffers are reset.  The logged score is the mean
    over all full 1000-sample chunks; any trailing partial chunk is silently
    dropped (integer division below), which matches the CSN protocol.

    :param model: Embedding model called directly, or a seq2seq/bimodal model
        whose ``.encoder`` CLS state is used (detected from the class name).
    :param test_loader: DataLoader whose batches hold code inputs plus a
        ``labels`` tensor with the tokenized docstrings.  ``batch_size`` must
        divide CSN_BATCH_SIZE, otherwise the flush condition never fires —
        the callers assert this.
    :param device: torch device the batches are moved to.
    :param desc: Prefix for the logged result line.
    :param kwargs: Ignored; accepted for signature compatibility with
        ``evaluation`` (e.g. ``max_batches``).
    :return: None; the mean MRR is logged.
    """
    from src.metrics import mrr

    # Fail early instead of dividing by zero after a full (useless) pass.
    if len(test_loader.dataset) < CSN_BATCH_SIZE:
        raise ValueError(f'CSN evaluation requires at least {CSN_BATCH_SIZE} samples, '
                         f'got {len(test_loader.dataset)}')

    model.eval()
    test_text_embeddings = []
    test_code_embeddings = []
    model_type = str(type(model)).lower()
    # Bimodal / seq2seq checkpoints expose embeddings via their encoder's CLS state.
    is_encoder_only = '220m-bimodal' not in model_type and 'seq2seq' not in model_type

    test_sum_MRR = 0
    for batch in tqdm(test_loader, total=len(test_loader), desc='Testing the model', position=0):
        batch.to(device)
        text_batch = batch.pop("labels")

        with torch.no_grad():
            if is_encoder_only:
                test_code_embeddings.append(model(**batch).cpu())
                # Assumes padding token id is 0 when rebuilding the text attention mask — TODO confirm.
                test_text_embeddings.append(model(text_batch, attention_mask=(text_batch != 0).int()).cpu())
            else:
                test_code_embeddings.append(model.encoder(batch['input_ids']).last_hidden_state[:, 0, :].cpu())
                test_text_embeddings.append(model.encoder(text_batch).last_hidden_state[:, 0, :].cpu())
        # Flush once a full 1000-sample chunk has been accumulated.
        if len(test_code_embeddings) == CSN_BATCH_SIZE // test_loader.batch_size:
            test_text_embeddings = np.concatenate(test_text_embeddings, 0)
            test_code_embeddings = np.concatenate(test_code_embeddings, 0)
            similarity_matrix = test_text_embeddings @ test_code_embeddings.T
            assert similarity_matrix.shape == (
                1000, 1000), f'Similarity matrix should have shape (1000, 1000), found: {similarity_matrix.shape}'
            test_sum_MRR += mrr(similarity_matrix)
            test_text_embeddings = []
            test_code_embeddings = []
    num_csn_batches = len(test_loader.dataset) // CSN_BATCH_SIZE
    logger.info(f'{desc}{test_sum_MRR / num_csn_batches}')
== 0) \ + or evaluation_function == evaluation, f'For CSN evaluation batch_size should be a divisor of 1000)' + device = torch.device(device_type) model = AutoModel.from_pretrained(model_name, device_map={"": 0}, trust_remote_code=True) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -122,7 +167,7 @@ def eval_base_model(model_name: str, data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name) test_loader = DataLoader(tokenized_dataset['test'], batch_size=batch_size, shuffle=False, collate_fn=data_collator) - evaluation(model, test_loader, device, desc=f'Test 0-shot MRR for {language} = ', max_batches=max_batches) + evaluation_function(model, test_loader, device, desc=f'Test 0-shot MRR for {language} = ', max_batches=max_batches) if __name__ == '__main__': @@ -132,6 +177,12 @@ def eval_base_model(model_name: str, from src.models.train import _setup_seq2seq_dataset Fire({ - 'base': eval_base_model, - 'peft': eval_peft_model, + 'base': { + 'codebert': partial(eval_base_model, evaluation_function=evaluation), + 'csn': partial(eval_base_model, evaluation_function=evaluation_csn) + }, + 'peft': { + 'codebert': partial(eval_peft_model, evaluation_function=evaluation), + 'csn': partial(eval_peft_model, evaluation_function=evaluation_csn) + } }) diff --git a/src/models/rag.py b/src/models/rag.py new file mode 100644 index 0000000..aecc037 --- /dev/null +++ b/src/models/rag.py @@ -0,0 +1,184 @@ +import os +import sys +import warnings +from typing import List + +import evaluate +import numpy as np +import torch +from datasets import load_dataset +from fire import Fire +from langchain.docstore.document import Document as LangchainDocument +from langchain.llms.huggingface_pipeline import HuggingFacePipeline +from langchain.pydantic_v1 import BaseModel +from langchain.schema.embeddings import Embeddings +from langchain.vectorstores.faiss import FAISS +from langchain_community.vectorstores.utils import DistanceStrategy +from 
class CodeT5PlusEmbedder(BaseModel, Embeddings, extra='allow'):
    """LangChain-compatible embedder backed by a (optionally PEFT-tuned) CodeT5+ model."""

    def __init__(self, model_name_or_path, device_type='cuda', pbar=True, *args, **kwargs):
        """Load either a PEFT adapter checkpoint or a plain HF model.

        :param model_name_or_path: Local PEFT checkpoint dir or HF hub model id.
        :param device_type: torch device string the model is moved to.
        :param pbar: Show a tqdm progress bar while embedding documents.
        """
        super().__init__(*args, **kwargs)
        self.device = torch.device(device_type)
        try:
            # Try the path as a PEFT adapter first; fall back to a plain base model.
            config = PeftConfig.from_pretrained(model_name_or_path)
            model = AutoModel.from_pretrained(config.base_model_name_or_path, device_map={"": 0}, trust_remote_code=True)
            self.model = PeftModel.from_pretrained(model, model_name_or_path).to(self.device)
            self.tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
        except Exception:
            # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit propagate.
            self.model = AutoModel.from_pretrained(model_name_or_path, device_map={"": 0}, trust_remote_code=True).to(self.device)
            self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
        # NOTE(review): device_map={"": 0} pins the weights to GPU 0 even when
        # device_type='cpu' — confirm this is intended before CPU-only runs.
        self.data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer, model=model_name_or_path)
        self.pbar = pbar

    def _inference(self, texts: List[str]) -> List[List[float]]:
        """Tokenize *texts*, embed them in mini-batches of 32, return plain lists."""
        model_inputs = [self.tokenizer(text, max_length=256, padding='max_length', truncation=True, return_tensors='pt')
                        for text in texts]
        loader = DataLoader(model_inputs, batch_size=32, shuffle=False, collate_fn=self.data_collator)
        embeddings = []
        batches = loader if not self.pbar else tqdm(loader, total=len(loader), desc='Iterating over documents')
        for batch in batches:
            # Each per-text encoding carries a singleton batch dim; collapse it after collation.
            batch['input_ids'] = batch['input_ids'].squeeze(1)
            batch['attention_mask'] = batch['attention_mask'].squeeze(1)
            batch.to(self.device)
            with torch.no_grad():
                embeddings.append(self.model(**batch).cpu())
        embeddings = np.concatenate(embeddings, 0).reshape(-1, embeddings[0].shape[-1])
        return embeddings.tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of documents (LangChain Embeddings API)."""
        return self._inference(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string (LangChain Embeddings API)."""
        return self._inference([text])[0]
def rag_inference(reader_llm, vector_db, rag_prompt, query):
    """Answer *query* by retrieving top-5 similar documents and prompting the reader LLM.

    :param reader_llm: Callable LLM that maps a prompt string to an answer.
    :param vector_db: Vector store with a ``similarity_search(query=..., k=...)`` method.
    :param rag_prompt: Prompt template with ``{question}`` and ``{context}`` slots.
    :param query: The user question.
    :return: The reader LLM's generated answer.
    """
    top_docs = vector_db.similarity_search(query=query, k=5)

    # Concatenate the retrieved snippets, numbering each one so the reader
    # can cite its source document.
    context_parts = ["\nExtracted documents:\n"]
    for idx, doc in enumerate(top_docs):
        context_parts.append(f"Document {str(idx)}:::\n" + doc.page_content)
    context = "".join(context_parts)

    final_prompt = rag_prompt.format(question=query, context=context)
    return reader_llm(final_prompt)
To finish inference, enter <>: ") + if user_query == '<>': + break + rag_answer = rag_inference(reader_llm, vector_db, rag_prompt, user_query) + print("RAG answer: ") + print(rag_answer) + print('\n', '=' * 45) + + +if __name__ == '__main__': + sys.path.append(os.getcwd()) + warnings.filterwarnings('ignore') + + Fire( + { + 'interactive': rag_interactive_inference, + 'benchmarking': rag_benchmarking + } + )