Source code for mltb2.transformers

# Copyright (c) 2023 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""This module offers `Hugging Face Transformers <https://huggingface.co/docs/transformers/index>`_ specific tools.

Hint:
    Use pip to install the necessary dependencies for this module:
    ``pip install mltb2[transformers]``
"""

import os
from dataclasses import dataclass, field
from typing import Iterable, List, Union

import sklearn.model_selection
import torch
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers.tokenization_utils import PreTrainedTokenizerBase


@dataclass
class TransformersTokenCounter:
    """Count Transformers tokenizer tokens.

    Args:
        pretrained_model_name_or_path:
            The *model id* of a tokenizer hosted inside a model repo on huggingface.co
            or a path to a *directory* containing a tokenizer.
        show_progress_bar: Show a progressbar during processing.
    """

    pretrained_model_name_or_path: Union[str, os.PathLike]
    tokenizer: PreTrainedTokenizerBase = field(init=False, repr=False)
    show_progress_bar: bool = False

    def __post_init__(self):
        """Do post init."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained_model_name_or_path)

    def __call__(self, text: Union[str, Iterable]) -> Union[int, List[int]]:
        """Count tokens for text.

        Args:
            text: The text for which the tokens are to be counted.

        Returns:
            The number of tokens if ``text`` is a ``str``.
            If ``text`` is an ``Iterable``, a ``list`` with the number of tokens per element.
        """
        if isinstance(text, str):
            tokenized_text = self.tokenizer.tokenize(text)
            return len(tokenized_text)
        else:
            counts = []
            for t in tqdm(text, disable=not self.show_progress_bar):
                tokenized_text = self.tokenizer.tokenize(t)
                counts.append(len(tokenized_text))
            return counts
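

# A minimal usage sketch, not part of the library API: the model id "bert-base-uncased"
# is only an illustrative choice and the tokenizer is downloaded on first use.
def _example_token_counting():
    """Count tokens for a single string and for a list of strings."""
    token_counter = TransformersTokenCounter("bert-base-uncased")
    single_count = token_counter("Hello world!")  # returns an int
    many_counts = token_counter(["Hello world!", "Another sentence."])  # returns a list of ints
    return single_count, many_counts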


class LabeledDataset(torch.utils.data.Dataset):
    """Dataset with labels."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):  # noqa: D105
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):  # noqa: D105
        return len(self.labels)
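

# A minimal usage sketch, not part of the library API: it assumes the texts are tokenized
# with padding so all encodings have the same length; the model id is illustrative.
def _example_labeled_dataset():
    """Build a small ``LabeledDataset`` from tokenized texts and integer labels."""
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    encodings = tokenizer(["a good movie", "a bad movie"], truncation=True, padding=True)
    labels = [1, 0]
    dataset = LabeledDataset(encodings, labels)
    # each item is a dict of tensors ("input_ids", "attention_mask", ...) plus a "labels" entry
    return dataset[0]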


class KFoldLabeledDataset:
    """Utility to do k-fold cross-validation on ``LabeledDataset``.

    Args:
        n_splits: Number of folds.
        n_repeats: Number of times the cross-validation is repeated.
        random_state: Controls the randomness of each repetition.
    """

    def __init__(self, n_splits=7, n_repeats=1, random_state=None):
        self.n_splits = n_splits
        self.n_repeats = n_repeats
        self.random_state = random_state

    def split(self, labeled_dataset, stratification_labels=None):
        """Generate data splits of training and test set.

        Args:
            labeled_dataset: The dataset to split.
            stratification_labels: Labels used for stratified splitting (optional).

        Yields:
            Tuples of train and test ``torch.utils.data.Subset``.
        """
        idxs = list(range(len(labeled_dataset)))
        if stratification_labels is None:  # no stratification wanted
            k_fold = sklearn.model_selection.RepeatedKFold(
                n_splits=self.n_splits,
                n_repeats=self.n_repeats,
                random_state=self.random_state,
            )
            k_fold_split = k_fold.split(idxs)
        else:  # stratification wanted
            k_fold = sklearn.model_selection.RepeatedStratifiedKFold(
                n_splits=self.n_splits,
                n_repeats=self.n_repeats,
                random_state=self.random_state,
            )
            k_fold_split = k_fold.split(idxs, stratification_labels)

        for train_idxs, test_idxs in k_fold_split:
            train = torch.utils.data.Subset(labeled_dataset, train_idxs)
            test = torch.utils.data.Subset(labeled_dataset, test_idxs)
            yield train, test
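

# A minimal usage sketch, not part of the library API: it wraps each train/test split in a
# ``DataLoader``; the number of splits, batch size, and random state are illustrative choices.
def _example_k_fold(labeled_dataset):
    """Iterate over train/test splits of a ``LabeledDataset`` (e.g. built as shown above)."""
    k_fold = KFoldLabeledDataset(n_splits=5, n_repeats=1, random_state=42)
    for train, test in k_fold.split(labeled_dataset):
        train_loader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
        test_loader = torch.utils.data.DataLoader(test, batch_size=8)
        # train and evaluate a model on this fold here
        _ = (train_loader, test_loader)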