# Module: policyML.raw_dataset_preprocessing
"""
Raw data preparations for the policyML project.

Phase 1: Splitting the dataset into historic and new datasets (used for training).
Phase 2: Splitting the new dataset into monthly datasets (used for inference).
"""

import logging
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# Configure the root logger at INFO level as soon as this module is imported.
# NOTE(review): this is an import-time side effect. basicConfig() is a no-op
# when the root logger already has handlers, so an application that sets up
# logging before importing this module keeps its own configuration.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def split_dataset_into_historic_and_new(data_dir: Path = Path("../data/raw")) -> None:
    """
    Split the source training set into two disjoint halves: historic and new.

    Reads ``<data_dir>/source/trainset.csv``, performs a reproducible 50/50
    random split (``random_state=42``), verifies that no source row ended up
    in both halves, and writes the halves to ``<data_dir>/historic_dataset.csv``
    and ``<data_dir>/new_dataset.csv`` (index column omitted).

    Parameters:
        data_dir (Path): Raw-data directory. The input file is read from its
                         ``source`` subdirectory; both output files are
                         written directly into ``data_dir``.

    Raises:
        ValueError: If the two halves share a source row.

    Errors raised by ``pd.read_csv``, ``train_test_split`` or ``to_csv`` are
    not caught here and propagate to the caller.
    """
    input_path = data_dir / "source" / "trainset.csv"
    historic_path = data_dir / "historic_dataset.csv"
    new_path = data_dir / "new_dataset.csv"

    logger.info("Loading dataset from: %s", input_path)
    df = pd.read_csv(input_path)

    # Fixed seed keeps the split identical across runs.
    historic_dataset, new_dataset = train_test_split(df, test_size=0.5, random_state=42)

    # Disjointness is about row membership, so compare the row labels that
    # both halves inherit from ``df``. Merging on every column instead would
    # compare record *content*: two identical source records landing on
    # opposite sides would be reported as an overlap, and heavily duplicated
    # data would make that merge blow up in size.
    if historic_dataset.index.isin(new_dataset.index).any():
        raise ValueError("Datasets are not disjoint")

    logger.info("Saving historic dataset to: %s", historic_path)
    historic_dataset.to_csv(historic_path, index=False)

    logger.info("Saving new dataset to: %s", new_path)
    new_dataset.to_csv(new_path, index=False)


# TODO: Implement Phase 2 (monthly split of the new dataset).


def monthly_split_new_dataset(data_dir: Path = Path("../data/raw")) -> None:
    """
    Placeholder for Phase 2: splitting the new dataset into monthly datasets.

    Not implemented yet. Calling it only logs an informational message and
    returns; ``data_dir`` is accepted but not read, and no files are touched.
    The planned behavior is to take 'new_dataset.csv' and split it into
    monthly datasets for inference.
    """
    # The Phase 2 logic is still to be written; for now just tell the caller.
    not_implemented_notice = "Monthly split of new dataset is not yet implemented."
    logger.info(not_implemented_notice)