Source code for realtabformer.rtf_datacollator

from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch


@dataclass
class RelationalDataCollator:
    """
    Data collator that will dynamically pad the inputs received, as well as
    the labels. Adapted from `DataCollatorForSeq2Seq`:
    https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/data/data_collator.py#L510

    Args:
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length.
        pad_to_multiple_of (`int`, *optional*):
            If set, will pad the sequence to a multiple of the provided value.
            This is especially useful for enabling the use of Tensor Cores on
            NVIDIA hardware with compute capability >= 7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically
            ignored by PyTorch loss functions).
        return_tensors (`str`):
            The type of tensor to return. Allowable values are "np" and "pt";
            "tf" is not supported yet.
    """
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"
    def __call__(self, features, return_tensors=None):
        if return_tensors is None:
            return_tensors = self.return_tensors

        labels = (
            [feature["labels"] for feature in features]
            if "labels" in features[0].keys()
            else None
        )

        # We have to pad the labels before calling `tokenizer.pad`, as this
        # method won't pad them and needs them to be of the same length to
        # return tensors.
        if labels is not None:
            max_label_length = max(len(label) for label in labels)
            if self.pad_to_multiple_of is not None:
                # Round the target length up to the nearest multiple of
                # `pad_to_multiple_of`, e.g., 5 -> 8 when padding to
                # multiples of 8.
                max_label_length = (
                    (max_label_length + self.pad_to_multiple_of - 1)
                    // self.pad_to_multiple_of
                    * self.pad_to_multiple_of
                )

            for feature in features:
                remainder = [self.label_pad_token_id] * (
                    max_label_length - len(feature["labels"])
                )
                if isinstance(feature["labels"], list):
                    feature["labels"] = feature["labels"] + remainder
                else:
                    # Pad always at the right.
                    feature["labels"] = np.concatenate(
                        [feature["labels"], remainder]
                    ).astype(np.int64)

        labels = [feature["labels"] for feature in features]
        input_ids = [feature["input_ids"] for feature in features]

        if return_tensors == "np":
            labels = np.vstack(labels)
            input_ids = np.vstack(input_ids)
        elif return_tensors == "pt":
            labels = torch.vstack([torch.tensor(label) for label in labels])
            input_ids = torch.vstack([torch.tensor(ii) for ii in input_ids])
        elif return_tensors == "tf":
            raise ValueError("Tensorflow tensor is not supported yet.")

        return dict(
            labels=labels,
            input_ids=input_ids,
        )
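
# A minimal usage sketch, not part of the module source above. The feature
# values are illustrative; real features come from the REaLTabFormer
# tokenization pipeline. Note that only `labels` are padded by this collator,
# so the sketch assumes every feature carries equal-length `input_ids`
# (ragged `input_ids` would make the final `vstack` fail).
if __name__ == "__main__":
    collator = RelationalDataCollator(pad_to_multiple_of=8)
    features = [
        # `input_ids` share a length; `labels` are ragged and get padded
        # on the right with -100 up to a multiple of 8 (here, 5 -> 8).
        {"input_ids": [11, 12, 13, 14], "labels": [21, 22, 23]},
        {"input_ids": [15, 16, 17, 18], "labels": [24, 25, 26, 27, 28]},
    ]
    batch = collator(features)
    print(batch["input_ids"].shape)  # torch.Size([2, 4])
    print(batch["labels"].shape)     # torch.Size([2, 8])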