Source code for scripts.train.trainers.dataset

import json
import os
import time
from typing import List

import pandas as pd


class LineClassifierDataset:
    """
    Dataset of lines in form of a feature matrix (with already extracted features).

    The dataset wraps a single pandas dataframe and remembers which of its columns
    hold features, labels, group identifiers and the lines' text.
    """

    def __init__(self, dataframe: pd.DataFrame, feature_list: List[str], group_name: str, label_name: str, text_name: str) -> None:
        """
        :param dataframe: pandas dataframe with features and metadata, columns are features (and metadata),
            rows are documents' lines
        :param feature_list: list of feature columns' names
        :param group_name: name of the group column, e.g., "document"
            (dataset is split into groups, one group of lines is related to one document)
        :param label_name: name of the label column, e.g., "label"
        :param text_name: name of the column with lines' text, e.g., "text"
        """
        self.dataframe = dataframe
        self.feature_list = feature_list
        self.group_name = group_name
        self.label_name = label_name
        self.text_name = text_name

    @property
    def features(self) -> pd.DataFrame:
        """
        pandas dataframe with features, columns are features, rows are documents' lines
        """
        return self.dataframe[self.feature_list]

    @property
    def labels(self) -> pd.Series:
        """
        a vector of labels for each line
        """
        return self.dataframe[self.label_name]

    def save(self, path: str = "/tmp", csv_only: bool = False) -> str:
        """
        Save the dataset to disk.

        With ``csv_only=True`` a single ``dataset.csv`` file is written directly into ``path``.
        Otherwise a new ``dataset_<timestamp in ms>`` sub-directory is created inside ``path`` and filled with
        ``dataset.csv``, ``dataset.pkl.gz`` and ``description.json`` (column names describing the dataset).

        :param path: path to the directory where the dataset will be saved, it is created if it doesn't exist
        :param csv_only: whether to save only csv-file instead of saving dataset as a directory
            with csv, pkl and json files
        :return: path to the directory where the dataset is saved
        :raises FileExistsError: if the timestamped output directory already exists
        """
        if csv_only:
            # to_csv doesn't create missing directories, so make sure the target exists first
            os.makedirs(path, exist_ok=True)
            self.dataframe.to_csv(os.path.join(path, "dataset.csv"))
            return path

        dir_out = os.path.join(path, f"dataset_{int(time.time() * 1000)}")
        # creates missing parents as well; still raises if the timestamped directory itself already exists
        os.makedirs(dir_out)

        self.dataframe.to_csv(os.path.join(dir_out, "dataset.csv"))
        self.dataframe.to_pickle(os.path.join(dir_out, "dataset.pkl.gz"))

        description = dict(
            label_name=self.label_name,
            group_name=self.group_name,
            text_name=self.text_name,
            feature_list=self.feature_list,
        )
        # ensure_ascii=False writes non-ASCII characters as is, so the file encoding must be explicit
        # instead of depending on the platform's default locale
        with open(os.path.join(dir_out, "description.json"), "w", encoding="utf-8") as out:
            json.dump(obj=description, fp=out, ensure_ascii=False, indent=4)
        return dir_out