Source code for realtabformer.rtf_datacollator

from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch


@dataclass
class RelationalDataCollator:
    """
    Data collator that will dynamically pad the inputs received, as well as
    the labels. Adapted from `DataCollatorForSeq2Seq`:
    https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/data/data_collator.py#L510

    Args:
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length.
        pad_to_multiple_of (`int`, *optional*):
            If set, will pad the sequence to a multiple of the provided value.
            This is especially useful for enabling the use of Tensor Cores on
            NVIDIA hardware with compute capability >= 7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically
            ignored by PyTorch loss functions).
        return_tensors (`str`):
            The type of tensor to return. Allowable values are "np" and "pt";
            "tf" is not supported yet.
    """
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"
    def __call__(self, features, return_tensors=None):
        if return_tensors is None:
            return_tensors = self.return_tensors

        labels = (
            [feature["labels"] for feature in features]
            if "labels" in features[0].keys()
            else None
        )

        # We have to pad the labels before calling `tokenizer.pad`, as this
        # method won't pad them and needs them to be of the same length to
        # return tensors.
        if labels is not None:
            max_label_length = max(len(label) for label in labels)
            if self.pad_to_multiple_of is not None:
                # Round the target length up to the nearest multiple of
                # `pad_to_multiple_of`, e.g., 5 -> 8 when padding to
                # multiples of 8.
                max_label_length = (
                    (max_label_length + self.pad_to_multiple_of - 1)
                    // self.pad_to_multiple_of
                    * self.pad_to_multiple_of
                )

            for feature in features:
                remainder = [self.label_pad_token_id] * (
                    max_label_length - len(feature["labels"])
                )
                if isinstance(feature["labels"], list):
                    feature["labels"] = feature["labels"] + remainder
                else:
                    # Pad always at the right.
                    feature["labels"] = np.concatenate(
                        [feature["labels"], remainder]
                    ).astype(np.int64)

        labels = [feature["labels"] for feature in features]
        input_ids = [feature["input_ids"] for feature in features]

        if return_tensors == "np":
            labels = np.vstack(labels)
            input_ids = np.vstack(input_ids)
        elif return_tensors == "pt":
            labels = torch.vstack([torch.tensor(label) for label in labels])
            input_ids = torch.vstack([torch.tensor(ii) for ii in input_ids])
        elif return_tensors == "tf":
            raise ValueError("Tensorflow tensor is not supported yet.")

        return dict(
            labels=labels,
            input_ids=input_ids,
        )
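
# A minimal usage sketch, not part of the module source above. The feature
# values are illustrative; real features come from the REaLTabFormer
# tokenization pipeline. Note that only `labels` are padded by this collator,
# so the sketch assumes every feature carries equal-length `input_ids`
# (ragged `input_ids` would make the final `vstack` fail).
if __name__ == "__main__":
    collator = RelationalDataCollator(pad_to_multiple_of=8)
    features = [
        # `input_ids` share a length; `labels` are ragged and get padded
        # on the right with -100 up to a multiple of 8 (here, 5 -> 8).
        {"input_ids": [11, 12, 13, 14], "labels": [21, 22, 23]},
        {"input_ids": [15, 16, 17, 18], "labels": [24, 25, 26, 27, 28]},
    ]
    batch = collator(features)
    print(batch["input_ids"].shape)  # torch.Size([2, 4])
    print(batch["labels"].shape)     # torch.Size([2, 8])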