-
Notifications
You must be signed in to change notification settings - Fork 14
Open
Description
Hello, I have gotten this model to train on custom data. It gets through the surrogate and extreme steps, but while computing the ANN/shortlist it appears to try to pickle an extremely large object and throws this error:
File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/main.py", line 428, in main
output = train(model, params)
File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/main.py", line 107, in train
surrogate_mapping=params.surrogate_mapping)
File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 528, in fit
use_intermediate_for_shorty, precomputed_intermediate)
File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 337, in _fit
self.save_checkpoint(model_dir, epoch+1)
File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 695, in save_checkpoint
model_dir, self.tracking.saved_checkpoints[-1]['ANN']))
File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/shortlist.py", line 105, in save
self.knn.save(fname+'.knn')
File "/home/chmo/.local/lib/python3.7/site-packages/xclib-0.97-py3.7-linux-x86_64.egg/xclib/utils/shortlist.py", line 396, in save
'space': self.space}, open(fname+".metadata", 'wb'))
OverflowError: cannot serialize a bytes object larger than 4 GiB
Is this where the kNN checkpoint is saved? Why is the object larger than 4 GB? I ask because the paper reports that the model was trained on a dataset with a label space of 62 million. I am running this training with a P100 and 61 workers.
Stats on my dataset:
number of features: 111,786
number of labels: 699,001
And my config:
{
"global": {
"dataset": "leaf",
"feature_type": "sparse",
"num_labels": 699001,
"arch": "Astec",
"A": 0.55,
"B": 1.5,
"use_reranker": true,
"surrogate_threshold": 512,
"surrogate_method": 1,
"embedding_dims": 391,
"top_k": 100,
"beta": 0.3,
"save_predictions": true,
"trn_label_fname": "trn_X_Y.txt",
"val_label_fname": "tst_X_Y.txt",
"tst_label_fname": "tst_X_Y.txt",
"trn_feat_fname": "trn_X_Xf.txt",
"val_feat_fname": "tst_X_Xf.txt",
"tst_feat_fname": "tst_X_Xf.txt"
},
"surrogate": {
"num_epochs": 20,
"dlr_factor": 0.5,
"learning_rate": 0.01,
"batch_size": 255,
"dlr_step": 14,
"normalize": true,
"init": "token_embeddings",
"optim": "Adam",
"embeddings": "fasttextB_embeddings_391d.npy",
"validate": true,
"save_intermediate": true
},
"extreme": {
"num_epochs": 20,
"dlr_factor": 0.5,
"learning_rate": 0.007,
"batch_size": 255,
"dlr_step": 14,
"ns_method": "ensemble",
"num_centroids": 1,
"efC": 300,
"efS": 400,
"M": 100,
"num_nbrs": 500,
"ann_threads": 18,
"beta": 0.5,
"surrogate_mapping": true,
"num_clf_partitions": 1,
"optim": "Adam",
"freeze_intermediate": true,
"validate": true,
"model_method": "shortlist",
"normalize": true,
"shortlist_method": "hybrid",
"init": "intermediate",
"use_shortlist": true,
"use_intermediate_for_shorty": true
},
"reranker": {
"num_epochs": 20,
"dlr_factor": 0.5,
"learning_rate": 0.005,
"batch_size": 255,
"dlr_step": 10,
"beta": 0.6,
"num_clf_partitions": 1,
"optim": "Adam",
"validate": true,
"model_method": "reranker",
"shortlist_method": "static",
"surrogate_mapping": true,
"normalize": true,
"use_shortlist": true,
"init": "token_embeddings",
"save_intermediate": false,
"keep_invalid": true,
"freeze_intermediate": false,
"update_shortlist": false,
"use_pretrained_shortlist": true,
"embeddings": "fasttextB_embeddings_391d.npy"
}
}
Metadata
Metadata
Assignees
Labels
No labels