embeddings - LM-based models inference

Warning

WIP - This tutorial is a work in progress. We will update and validate the content in the coming weeks.

import os

# Switch the working directory to the parent directory before the project
# imports below run.
# NOTE(review): presumably the notebook lives in a subfolder and this makes the
# repository root the CWD so that the `embeddings` package and its relative
# resource paths resolve — confirm.
os.chdir("..")
from typing import Any, Dict

import pytorch_lightning as pl
from embeddings.config.lightning_config import LightningAdvancedConfig
from embeddings.defaults import DATASET_PATH, RESULTS_PATH
from embeddings.model.lightning_module.text_classification import (
    TextClassificationModule,
)
from embeddings.pipeline.hf_preprocessing_pipeline import (
    HuggingFacePreprocessingPipeline,
)
from embeddings.pipeline.lightning_classification import LightningClassificationPipeline
from embeddings.task.lightning_task.text_classification import TextClassificationTask
from embeddings.utils.utils import build_output_path

/opt/conda/envs/embeddings/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

# Identifiers of the language model and the dataset used throughout this tutorial.
embedding_name_or_path = "hf-internal-testing/tiny-albert"
dataset_name = "clarin-pl/polemo2-official"

# Both locations are derived from the same (embedding, dataset) pair:
# first the dataset location under DATASET_PATH, then the results location
# under RESULTS_PATH.
dataset_path, output_path = (
    build_output_path(root_dir, embedding_name_or_path, dataset_name)
    for root_dir in (DATASET_PATH, RESULTS_PATH)
)

2023-02-13 23:05:26,246 - embeddings.utils.utils - WARNING - String 'hf-internal-testing/tiny-albert' contains '/'. Replacing it with '__'. Cleaned_text: hf-internal-testing__tiny-albert.
2023-02-13 23:05:26,247 - embeddings.utils.utils - WARNING - String 'clarin-pl/polemo2-official' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__polemo2-official.
2023-02-13 23:05:26,254 - embeddings.utils.utils - WARNING - String 'hf-internal-testing/tiny-albert' contains '/'. Replacing it with '__'. Cleaned_text: hf-internal-testing__tiny-albert.
2023-02-13 23:05:26,256 - embeddings.utils.utils - WARNING - String 'clarin-pl/polemo2-official' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__polemo2-official.

Preprocess and downsample data

def preprocess_data(path: str) -> Dict[str, Any]:
    """Preprocess and downsample the dataset, persisting the result at ``path``.

    Runs a ``HuggingFacePreprocessingPipeline`` for the module-level
    ``dataset_name``, restricted to the "hotels" and "medicine" domains in
    every split, with a fixed seed.

    Args:
        path: Location where the pipeline persists the processed dataset.

    Returns:
        Keyword arguments describing the persisted dataset (its path, the
        input column names and the target column name).
    """
    selected_domains = ("hotels", "medicine")
    # Each split gets its own list object, exactly as if written out literally.
    loader_options = {
        "train_domains": list(selected_domains),
        "dev_domains": list(selected_domains),
        "test_domains": list(selected_domains),
        "text_cfg": "text",
    }

    preprocessing = HuggingFacePreprocessingPipeline(
        dataset_name=dataset_name,
        load_dataset_kwargs=loader_options,
        persist_path=path,
        sample_missing_splits=None,
        ignore_test_subset=False,
        # NOTE(review): presumably per-split keep fractions in
        # (train, dev, test) order — confirm against the pipeline.
        downsample_splits=(0.01, 0.01, 0.05),
        seed=441,
    )
    preprocessing.run()

    return dict(
        dataset_name_or_path=path,
        input_column_name=["text"],
        target_column_name="target",
    )


# Run the preprocessing once and keep the returned dataset keyword arguments;
# they are unpacked into LightningClassificationPipeline further down.
dataset_kwargs = preprocess_data(dataset_path)

Using custom data configuration default-e0c1ce6ddfd81769
Found cached dataset polemo2-official (/root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70)
100%|██████████| 3/3 [00:00<00:00, 817.23it/s]
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-a54edce9681df8b7.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-09cf731207f31628.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-c48721732fabb729.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-f9d782422a65c7e6.arrow
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-0db6321193feb3ec.arrow and /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-4e6c26839c3e4adf.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-e32b75da1d28bfd0.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-98cbedcc70a23855.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/clarin-pl___polemo2-official/default-e0c1ce6ddfd81769/0.0.0/2b75fdbe5def97538e81fb120f8752744b50729a4ce09bd75132bfc863a2fd70/cache-b2cbb8ab856bac0f.arrow

Train simple downsampled pipeline

# Configuration for a quick, single-epoch training run.
config = LightningAdvancedConfig(
    # NOTE(review): presumably 0 keeps the whole encoder frozen (the run log
    # reports only 132 trainable parameters) — confirm.
    finetune_last_n_layers=0,
    task_train_kwargs={
        "max_epochs": 1,
        "deterministic": True,
    },
    task_model_kwargs={
        "learning_rate": 5e-4,
        "train_batch_size": 32,
        "eval_batch_size": 32,
        "use_scheduler": True,
        "optimizer": "AdamW",
        "adam_epsilon": 1e-8,
        "warmup_steps": 100,
        "weight_decay": 0.0,
    },
    datamodule_kwargs={
        "max_seq_length": 64,
    },
    early_stopping_kwargs={
        "monitor": "val/Loss",
        "mode": "min",
        "patience": 3,
    },
    # No overrides are passed for the remaining groups.
    tokenizer_kwargs={},
    batch_encoding_kwargs={},
    dataloader_kwargs={},
    model_config_kwargs={},
)

# Assemble the classification pipeline on top of the preprocessed dataset.
pipeline = LightningClassificationPipeline(
    **dataset_kwargs,
    embedding_name_or_path=embedding_name_or_path,
    output_path=output_path,
    config=config,
    # The run is pinned to the CPU regardless of GPU availability.
    accelerator="cpu",
    devices="auto",
)
result = pipeline.run()

100%|██████████| 1/1 [00:00<00:00,  6.73ba/s]
100%|██████████| 1/1 [00:00<00:00, 25.69ba/s]
100%|██████████| 1/1 [00:00<00:00, 26.51ba/s]
Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 113.84ba/s]
Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 68.70ba/s]
Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 103.15ba/s]
Some weights of the model checkpoint at hf-internal-testing/tiny-albert were not used when initializing AlbertForSequenceClassification: ['predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at hf-internal-testing/tiny-albert and are newly initialized: ['classifier.bias', 'albert.pooler.weight', 'albert.pooler.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1579: UserWarning: GPU available but not used. Set the gpus flag in your trainer `Trainer(gpus=1)` or script `--gpus=1`.
  rank_zero_warn(

  | Name          | Type                            | Params
------------------------------------------------------------------
0 | model         | AlbertForSequenceClassification | 352 K 
1 | metrics       | MetricCollection                | 0     
2 | train_metrics | MetricCollection                | 0     
3 | val_metrics   | MetricCollection                | 0     
4 | test_metrics  | MetricCollection                | 0     
------------------------------------------------------------------
132       Trainable params
352 K     Non-trainable params
352 K     Total params
1.410     Total estimated model params size (MB)
/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:111: UserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  rank_zero_warn(
/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:111: UserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  rank_zero_warn(
/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:407: UserWarning: The number of training samples (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
  rank_zero_warn(
/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:111: UserWarning: The dataloader, test_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  rank_zero_warn(
Restoring states from the checkpoint path at /app/resources/results/hf-internal-testing__tiny-albert/clarin-pl__polemo2-official/20230213_230526/checkpoints/epoch=0-step=1.ckpt
Loaded model weights from checkpoint at /app/resources/results/hf-internal-testing__tiny-albert/clarin-pl__polemo2-official/20230213_230526/checkpoints/epoch=0-step=1.ckpt
/opt/conda/envs/embeddings/lib/python3.9/site-packages/pytorch_lightning/trainer/data_loading.py:111: UserWarning: The dataloader, predict_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 48 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
  rank_zero_warn(
/app/embeddings/metric/hugging_face_metric.py:27: FutureWarning: load_metric is deprecated and will be removed in the next major version of datasets. Use 'evaluate.load' instead, from the new library 🤗 Evaluate: https://huggingface.co/docs/evaluate
  datasets.load_metric(metric, **init_kwargs) if isinstance(metric, str) else metric
/opt/conda/envs/embeddings/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/opt/conda/envs/embeddings/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
/opt/conda/envs/embeddings/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

Validation sanity check:   0%|          | 0/1 [00:00<?, ?it/s]                                                                      Epoch 0: 100%|██████████| 3/3 [00:01<00:00,  2.07it/s, loss=1.39, v_num=, train/BaseLR=5e-6, train/LambdaLR=5e-6, val/MulticlassAccuracy=0.375, val/MulticlassPrecision=0.0938, val/MulticlassRecall=0.250, val/MulticlassF1Score=0.136]
Testing: 100%|██████████| 2/2 [00:02<00:00,  1.02s/it]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test/Loss': 1.3823001384735107,
 'test/MulticlassAccuracy': 0.4054054021835327,
 'test/MulticlassF1Score': 0.14423076808452606,
 'test/MulticlassPrecision': 0.10135135054588318,
 'test/MulticlassRecall': 0.25}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 2/2 [00:02<00:00,  1.04s/it]
Predicting: 100%|██████████| 2/2 [00:01<?, ?it/s]

Load model from checkpoint automatically generated with Trainer

# Checkpoint file inside the run's output directory.
ckpt_path = output_path.joinpath("checkpoints", "last.ckpt")
ckpt_path

# Restore the whole task from that checkpoint.
task_from_ckpt = TextClassificationTask.from_checkpoint(
    output_path=output_path,
    checkpoint_path=ckpt_path,
)

Alternatively, we can load the model (LightningModule) directly from the checkpoint

# Load only the LightningModule from the checkpoint file; the path is
# converted to a plain string before being passed in.
model_from_ckpt = TextClassificationModule.load_from_checkpoint(str(ckpt_path))

A warning appears when loading the model; however, it was validated that the loaded weights are the same as the weights that were saved. The reason for the warning is that when the model state_dict keys are loaded from the cached Hugging Face model, some of them (cls.(…)) do not match the keys in the state_dict of the saved model weights.

https://github.com/CLARIN-PL/embeddings/issues/225

Use task from checkpoint for predictions

return_names needs to be set to False because it uses the datamodule to retrieve the names, and the datamodule is not attached to the Trainer in the LightningTask since we have not fitted it yet.

# Reuse the test dataloader of the datamodule built by the training pipeline.
test_dataloader = pipeline.datamodule.test_dataloader()
# return_names=False is passed explicitly: the names are retrieved through the
# Trainer's datamodule, which is not attached to a task restored from a
# checkpoint because that task has not been fitted.
preds = task_from_ckpt.predict(test_dataloader, return_names=False)
preds

Alternatively, we can explicitly assign the datamodule to the Trainer in the LightningTask

# Attach the pipeline's datamodule to the task's Trainer by hand, then ask
# predict() to return the names as well.
task_from_ckpt.trainer.datamodule = pipeline.datamodule
preds_with_names = task_from_ckpt.predict(test_dataloader, return_names=True)
preds_with_names

We can also use the previously loaded Lightning model (LightningModule) outside of the task and get the predictions. To do this, we also need to initialize a Trainer.

# A standalone Trainer, rooted at the run's output directory, drives prediction
# for the LightningModule that was loaded directly from the checkpoint.
trainer = pl.Trainer(default_root_dir=str(output_path))
preds_from_model = trainer.predict(model_from_ckpt, dataloaders=test_dataloader)
preds_from_model