Skip to content

training shortlist on large-ish dataset #6

@cairomo

Description

@cairomo

Hello, I have gotten this model working for training on custom data. It runs through the surrogate and extreme steps, but during the calculation of the ANN/shortlist it seems to be trying to pickle an extremely large object, and it throws this error:

File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/main.py", line 428, in main
    output = train(model, params)
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/main.py", line 107, in train
    surrogate_mapping=params.surrogate_mapping)
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 528, in fit
    use_intermediate_for_shorty, precomputed_intermediate)
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 337, in _fit
    self.save_checkpoint(model_dir, epoch+1)
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/model.py", line 695, in save_checkpoint
    model_dir, self.tracking.saved_checkpoints[-1]['ANN']))
  File "/data/ebay/notebooks/chmo/astec/programs/deepxml/deepxml/libs/shortlist.py", line 105, in save
    self.knn.save(fname+'.knn')
  File "/home/chmo/.local/lib/python3.7/site-packages/xclib-0.97-py3.7-linux-x86_64.egg/xclib/utils/shortlist.py", line 396, in save
    'space': self.space}, open(fname+".metadata", 'wb'))
OverflowError: cannot serialize a bytes object larger than 4 GiB

Is this where it is saving the kNN checkpoint? Why is the object so large that it is larger than 4 GiB? I am wondering because the paper reports that the model was trained on a dataset with a label space of 62 million. I am also running this training with a P100 and 61 workers.

Statistics for my dataset:
number of features: 111,786
number of labels: 699,001

And my config:

{
    "global": {
        "dataset": "leaf",
        "feature_type": "sparse",
        "num_labels": 699001,
        "arch": "Astec",
        "A": 0.55,
        "B": 1.5,
        "use_reranker": true,
        "surrogate_threshold": 512,
        "surrogate_method": 1,
        "embedding_dims": 391,
        "top_k": 100,
        "beta": 0.3,
        "save_predictions": true,
        "trn_label_fname": "trn_X_Y.txt",
        "val_label_fname": "tst_X_Y.txt",
        "tst_label_fname": "tst_X_Y.txt",
        "trn_feat_fname": "trn_X_Xf.txt",
        "val_feat_fname": "tst_X_Xf.txt",
        "tst_feat_fname": "tst_X_Xf.txt"
    },
    "surrogate": {
        "num_epochs": 20,
        "dlr_factor": 0.5,
        "learning_rate": 0.01,
        "batch_size": 255,
        "dlr_step": 14,
        "normalize": true,
        "init": "token_embeddings",
        "optim": "Adam",
        "embeddings": "fasttextB_embeddings_391d.npy",
        "validate": true,
        "save_intermediate": true
    },
    "extreme": {
        "num_epochs": 20,
        "dlr_factor": 0.5,
        "learning_rate": 0.007,
        "batch_size": 255,
        "dlr_step": 14,
        "ns_method": "ensemble",
        "num_centroids": 1,
        "efC": 300,
        "efS": 400,
        "M": 100,
        "num_nbrs": 500,
        "ann_threads": 18,
        "beta": 0.5,
        "surrogate_mapping": true,
        "num_clf_partitions": 1,
        "optim": "Adam",
        "freeze_intermediate": true,
        "validate": true,
        "model_method": "shortlist",
        "normalize": true,
        "shortlist_method": "hybrid",
        "init": "intermediate",
        "use_shortlist": true,
        "use_intermediate_for_shorty": true
    },
    "reranker": {
        "num_epochs": 20,
        "dlr_factor": 0.5,
        "learning_rate": 0.005,
        "batch_size": 255,
        "dlr_step": 10,
        "beta": 0.6,
        "num_clf_partitions": 1,
        "optim": "Adam",
        "validate": true,
        "model_method": "reranker",
        "shortlist_method": "static",
        "surrogate_mapping": true,
        "normalize": true,
        "use_shortlist": true,
        "init": "token_embeddings",
        "save_intermediate": false,
        "keep_invalid": true,
        "freeze_intermediate": false,
        "update_shortlist": false,
        "use_pretrained_shortlist": true,
        "embeddings": "fasttextB_embeddings_391d.npy"
    }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions