realtabformer.data_utils#

Module Contents#

Classes#

Functions#

get_uuid()

fix_multi_decimal(v)

build_vocab([df, special_tokens, add_columns])

process_numeric_data(→ Tuple[pandas.Series, Dict])

process_datetime_data(→ Tuple[pandas.Series, Dict])

process_categorical_data(→ pandas.Series)

encode_partition_numeric_col(col, tr, col_zfill)

decode_partition_numeric_col(partition_col)

tokenize_numeric_col(series[, nparts, col_zfill])

encode_column_values(series)

decode_column_values(series)

encode_processed_column(idx, dtype, col)

decode_processed_column(col)

extract_processed_column(col)

is_numeric_col(col)

is_datetime_col(col)

is_categorical_col(col)

is_numeric_datetime_col(col)

process_data(→ Tuple[pandas.DataFrame, Dict])

get_token_id(→ int)

get_input_ids(→ Dict)

make_dataset(→ datasets.Dataset)

get_relational_input_ids(→ dict)

make_relational_dataset(→ datasets.Dataset)

Attributes#

realtabformer.data_utils.TEACHER_FORCING_PRE = '_TEACHERFORCING'[source]#
realtabformer.data_utils.SPECIAL_COL_SEP = '___'[source]#
realtabformer.data_utils.NUMERIC_NA_TOKEN = '@'[source]#
realtabformer.data_utils.INVALID_NUMS_RE = '[^\\-.0-9]'[source]#
class realtabformer.data_utils.TabularArtefact[source]#
best_disc_model: str = 'best-disc-model'[source]#
mean_best_disc_model: str = 'mean-best-disc-model'[source]#
not_best_disc_model: str = 'not-best-disc-model'[source]#
last_epoch_model: str = 'last-epoch-model'[source]#
static artefacts()[source]#
class realtabformer.data_utils.ModelFileName[source]#
rtf_config_json: str = 'rtf_config.json'[source]#
rtf_model_pt: str = 'rtf_model.pt'[source]#
static names()[source]#
class realtabformer.data_utils.ModelType[source]#
tabular: str = 'tabular'[source]#
relational: str = 'relational'[source]#
static types()[source]#
class realtabformer.data_utils.ColDataType[source]#
NUMERIC: str = 'NUMERIC'[source]#
DATETIME: str = 'DATETIME'[source]#
CATEGORICAL: str = 'CATEGORICAL'[source]#
static types()[source]#
class realtabformer.data_utils.SpecialTokens[source]#
UNK: str = '[UNK]'[source]#
SEP: str = '[SEP]'[source]#
PAD: str = '[PAD]'[source]#
CLS: str = '[CLS]'[source]#
MASK: str = '[MASK]'[source]#
BOS: str = '[BOS]'[source]#
EOS: str = '[EOS]'[source]#
BMEM: str = '[BMEM]'[source]#
EMEM: str = '[EMEM]'[source]#
RMASK: str = '[RMASK]'[source]#
SPTYPE: str = '[SPTYPE]'[source]#
static tokens()[source]#
realtabformer.data_utils.get_uuid()[source]#
realtabformer.data_utils.fix_multi_decimal(v)[source]#
realtabformer.data_utils.build_vocab(df: pandas.DataFrame = None, special_tokens=None, add_columns: bool = True)[source]#
realtabformer.data_utils.process_numeric_data(series: pandas.Series, max_len: int = 10, numeric_precision: int = 4, transform_data: Dict = None) → Tuple[pandas.Series, Dict][source]#
realtabformer.data_utils.process_datetime_data(series, transform_data: Dict = None) → Tuple[pandas.Series, Dict][source]#
realtabformer.data_utils.process_categorical_data(series: pandas.Series) → pandas.Series[source]#
realtabformer.data_utils.encode_partition_numeric_col(col, tr, col_zfill)[source]#
realtabformer.data_utils.decode_partition_numeric_col(partition_col)[source]#
realtabformer.data_utils.tokenize_numeric_col(series: pandas.Series, nparts=2, col_zfill=2)[source]#
realtabformer.data_utils.encode_column_values(series)[source]#
realtabformer.data_utils.decode_column_values(series)[source]#
realtabformer.data_utils.encode_processed_column(idx, dtype, col)[source]#
realtabformer.data_utils.decode_processed_column(col)[source]#
realtabformer.data_utils.extract_processed_column(col)[source]#
realtabformer.data_utils.is_numeric_col(col)[source]#
realtabformer.data_utils.is_datetime_col(col)[source]#
realtabformer.data_utils.is_categorical_col(col)[source]#
realtabformer.data_utils.is_numeric_datetime_col(col)[source]#
realtabformer.data_utils.process_data(df: pandas.DataFrame, numeric_max_len=10, numeric_precision=4, numeric_nparts=2, first_col_type=None, col_transform_data: Dict = None, target_col: str = None) → Tuple[pandas.DataFrame, Dict][source]#
realtabformer.data_utils.get_token_id(token: str, vocab_token2id: Dict[str, int], mask_rate: float = 0) → int[source]#
realtabformer.data_utils.get_input_ids(example, vocab: Dict, columns: List, mask_rate: float = 0, return_label_ids: bool | None = True, return_token_type_ids: bool | None = False, affix_bos: bool | None = True, affix_eos: bool | None = True) → Dict[source]#
realtabformer.data_utils.make_dataset(df: pandas.DataFrame, vocab: Dict, mask_rate: float = 0, affix_eos: bool = True, return_token_type_ids: bool = False) → datasets.Dataset[source]#
realtabformer.data_utils.get_relational_input_ids(example, input_idx, vocab, columns, output_dataset, in_out_idx, output_max_length: int | None = None, return_token_type_ids: bool = False) → dict[source]#
realtabformer.data_utils.make_relational_dataset(in_df: pandas.DataFrame, out_df: pandas.DataFrame, vocab: dict, in_out_idx: dict, mask_rate=0, output_max_length: int | None = None, return_token_type_ids: bool = False) → datasets.Dataset[source]#