BaseDataLoader

Bases: BaseLoader, ABC

Abstract base class for loading, encoding, and scaling processed data.

This class provides methods for loading and saving processed data, verifying encoded and scaled columns, and defines abstract methods for encoding and scaling that must be implemented by subclasses.

Inherits

BaseLoader: Provides loading and saving capabilities for processed data.
ABC: Specifies abstract methods for subclasses to implement.

Parameters:

Name	Type	Description	Default
`task`	`str`	Specifies the task column name.	required
`encoding`	`Optional[str]`	Defines the encoding method for categorical columns. Options include 'one_hot', 'target', or None.	required
`encode`	`bool`	If True, applies encoding to categorical columns.	required
`scale`	`bool`	If True, applies scaling to numeric columns.	required

Attributes:

Name	Type	Description
`task`	`str`	Task column name used in transformations.
`encoding`	`str`	Encoding method specified for categorical columns.
`encode`	`bool`	Flag to apply encoding to categorical columns.
`scale`	`bool`	Flag to apply scaling to numeric columns.

Methods:

Name	Description
`load_data`	Load processed data from the specified path and file.
`save_data`	Save processed data to the specified path and file.

Abstract Methods

encode_categorical_columns: Encodes categorical columns in the DataFrame.
scale_numeric_columns: Scales numeric columns in the DataFrame.
transform_data: Processes and transforms the data.

Source code in periomod/data/_basedata.py

class BaseDataLoader(BaseLoader, ABC):
    """Abstract base class for loading, encoding, and scaling processed data.

    This class provides methods for loading and saving processed data, verifying
    encoded and scaled columns, and defines abstract methods for encoding and scaling
    that must be implemented by subclasses.

    Inherits:
        - `BaseLoader`: Provides loading and saving capabilities for processed data.
        - `ABC`: Specifies abstract methods for subclasses to implement.

    Args:
        task (str): Specifies the task column name.
        encoding (Optional[str]): Defines the encoding method for categorical columns.
            Options include 'one_hot', 'target', or None.
        encode (bool): If True, applies encoding to categorical columns.
        scale (bool): If True, applies scaling to numeric columns.

    Attributes:
        task (str): Task column name used in transformations.
        encoding (Optional[str]): Encoding method specified for categorical
            columns, or None when no encoding is requested.
        encode (bool): Flag to apply encoding to categorical columns.
        scale (bool): Flag to apply scaling to numeric columns.

    Methods:
        load_data: Load processed data from the specified file path.
        save_data: Save processed data to the specified file path.

    Abstract Methods:
        - `encode_categorical_columns`: Encodes categorical columns in the DataFrame.
        - `scale_numeric_columns`: Scales numeric columns in the DataFrame.
        - `transform_data`: Processes and transforms the data.

    Note:
        The private checks read `self.all_cat_vars` and `self.scale_vars`, which
        are not defined in this class; they are presumably supplied by
        `BaseLoader` (confirm against its definition).
    """

    def __init__(
        self, task: str, encoding: Optional[str], encode: bool, scale: bool
    ) -> None:
        """Initializes the BaseDataLoader with task, encoding, and scaling options.

        Args:
            task (str): Task column name.
            encoding (Optional[str]): Encoding method for categorical columns
                ('one_hot', 'target', or None).
            encode (bool): Whether categorical columns should be encoded.
            scale (bool): Whether numeric columns should be scaled.
        """
        super().__init__()
        self.task = task
        self.encoding = encoding
        self.encode = encode
        self.scale = scale

    @staticmethod
    def load_data(
        path: Union[str, Path] = Path("data/processed/processed_data.csv"),
    ) -> pd.DataFrame:
        """Loads the processed data from the specified file, with lowercasing.

        Args:
            path (Union[str, Path]): Path to the processed data CSV file.
                Relative paths are resolved against the current working
                directory. Defaults to
                Path("data/processed/processed_data.csv").

        Returns:
            pd.DataFrame: Loaded DataFrame with lowercase column names.

        Raises:
            FileNotFoundError: If nothing exists at the resolved path.
        """
        path = Path(path)

        if not path.is_absolute():
            path = Path.cwd() / path

        if not path.exists():
            raise FileNotFoundError(f"File not found: {path}")

        return pd.read_csv(path).rename(columns=str.lower)

    def save_data(
        self,
        df: pd.DataFrame,
        path: Union[str, Path] = Path("data/training/training_data.csv"),
    ) -> None:
        """Saves the processed DataFrame to a CSV file.

        The call is forwarded unchanged to the parent class's `save_data`; this
        override only supplies a different default `path`.

        Args:
            df (pd.DataFrame): The processed DataFrame.
            path (Union[str, Path], optional): File path where the dataset is
                saved. Defaults to Path("data/training/training_data.csv").
        """
        super().save_data(df=df, path=path)

    def _check_encoded_columns(self, df: pd.DataFrame) -> None:
        """Verifies that categorical columns were correctly one-hot or target encoded.

        For 'one_hot', no column listed in `self.all_cat_vars` may remain in
        `df`. For 'target', the 'toothside' column must be present. For None, a
        notice is printed and nothing is validated.

        Args:
            df (pd.DataFrame): The DataFrame to check.

        Raises:
            ValueError: If columns are not correctly encoded, or if
                `self.encoding` is not one of 'one_hot', 'target', or None.
        """
        if self.encoding == "one_hot":
            # A categorical source column that is still present was not
            # replaced by its encoded counterparts.
            # NOTE(review): the existence of `<col>_*` columns is deliberately
            # not verified here. That check could only ever run for columns
            # already known to be present (and therefore already rejected),
            # and this method cannot tell which categorical variables existed
            # before encoding.
            for col in self.all_cat_vars:
                if col in df.columns:
                    raise ValueError(
                        f"Column '{col}' was not correctly one-hot encoded."
                    )
        elif self.encoding == "target":
            if "toothside" not in df.columns:
                raise ValueError("Target encoding for 'toothside' failed.")
        elif self.encoding is None:
            print("No encoding was applied.")
        else:
            raise ValueError(f"Invalid encoding '{self.encoding}'.")

    def _check_scaled_columns(self, df: pd.DataFrame) -> None:
        """Verifies that scaled columns are within expected ranges.

        The check is skipped entirely when `self.scale` is falsy. Otherwise
        every column in `self.scale_vars` must have its minimum >= -10 and its
        maximum <= 20.

        Args:
            df (pd.DataFrame): The DataFrame to check.

        Raises:
            ValueError: If any columns are not correctly scaled.
        """
        if not self.scale:
            return

        for col in self.scale_vars:
            scaled_min = df[col].min()
            scaled_max = df[col].max()
            # Coarse sanity bounds; presumably chosen so that properly scaled
            # values always fall inside them (confirm with the scaler used).
            if scaled_min < -10 or scaled_max > 20:
                raise ValueError(f"Column {col} is not correctly scaled.")

    @abstractmethod
    def encode_categorical_columns(self, df: pd.DataFrame):
        """Encodes categorical columns in the DataFrame.

        Must be implemented by subclasses.

        Args:
            df (pd.DataFrame): The DataFrame containing categorical columns.
        """

    @abstractmethod
    def scale_numeric_columns(self, df: pd.DataFrame):
        """Scales numeric columns in the DataFrame.

        Must be implemented by subclasses.

        Args:
            df (pd.DataFrame): The DataFrame containing numeric columns.
        """

    @abstractmethod
    def transform_data(self, df: pd.DataFrame):
        """Processes and transforms the data.

        Must be implemented by subclasses.

        Args:
            df (pd.DataFrame): The DataFrame to transform.
        """

`__init__(task, encoding, encode, scale)` ¶

Initializes the BaseDataLoader with the task column name and the encoding and scaling options.

Source code in periomod/data/_basedata.py

def __init__(
    self, task: str, encoding: Optional[str], encode: bool, scale: bool
) -> None:
    """Initializes the BaseDataLoader with task, encoding, and scaling options.

    Runs the parent initializer first, then stores each argument on the
    instance under an attribute of the same name.

    Args:
        task (str): Task column name.
        encoding (Optional[str]): Encoding method for categorical columns.
        encode (bool): Whether categorical columns should be encoded.
        scale (bool): Whether numeric columns should be scaled.
    """
    super().__init__()
    self.task, self.encoding = task, encoding
    self.encode, self.scale = encode, scale

`encode_categorical_columns(df)` `abstractmethod` ¶

Encodes categorical columns in the DataFrame.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The DataFrame containing categorical columns.	required

Source code in periomod/data/_basedata.py

@abstractmethod
def encode_categorical_columns(self, df: pd.DataFrame):
    """Encodes categorical columns in the DataFrame.

    Abstract hook: this declaration carries no implementation, so concrete
    subclasses must override it.

    Args:
        df (pd.DataFrame): The DataFrame containing categorical columns.
    """

`load_data(path=Path('data/processed/processed_data.csv'))` `staticmethod` ¶

Loads the processed data from the specified path, with lowercasing.

Parameters:

Name	Type	Description	Default
`path`	`Union[str, Path]`	Path to the processed data CSV file; relative paths are resolved against the current working directory.	`Path('data/processed/processed_data.csv')`

Returns:

Type	Description
`DataFrame`	Loaded DataFrame with lowercase column names.

Source code in periomod/data/_basedata.py

@staticmethod
def load_data(
    path: Union[str, Path] = Path("data/processed/processed_data.csv"),
) -> pd.DataFrame:
    """Reads the processed-data CSV and returns it with lowercase column names.

    Args:
        path (Union[str, Path]): Location of the processed data CSV file. A
            relative path is resolved against the current working directory.

    Returns:
        pd.DataFrame: Loaded DataFrame with lowercase column names.

    Raises:
        FileNotFoundError: If nothing exists at the resolved path.
    """
    candidate = Path(path)
    # Anchor relative paths at the current working directory.
    resolved = candidate if candidate.is_absolute() else Path.cwd() / candidate

    if resolved.exists():
        frame = pd.read_csv(resolved)
        return frame.rename(columns=str.lower)

    raise FileNotFoundError(f"File not found: {resolved}")

`save_data(df, path=Path('data/training/training_data.csv'))` ¶

Saves the processed DataFrame to a CSV file.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The processed DataFrame.	required
`path`	`Union[str, Path]`	File path where the dataset is saved. Defaults to Path("data/training/training_data.csv").	`Path('data/training/training_data.csv')`

Source code in periomod/data/_basedata.py

def save_data(
    self,
    df: pd.DataFrame,
    path: Union[str, Path] = Path("data/training/training_data.csv"),
) -> None:
    """Saves the processed DataFrame to a CSV file.

    The call is forwarded unchanged to the parent class's ``save_data``; this
    override only supplies a different default ``path``.

    Args:
        df (pd.DataFrame): The processed DataFrame.
        path (Union[str, Path], optional): File path where the dataset is saved.
            Defaults to Path("data/training/training_data.csv").
    """
    super().save_data(df=df, path=path)

`scale_numeric_columns(df)` `abstractmethod` ¶

Scales numeric columns in the DataFrame.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The DataFrame containing numeric columns.	required

Source code in periomod/data/_basedata.py

@abstractmethod
def scale_numeric_columns(self, df: pd.DataFrame):
    """Scales numeric columns in the DataFrame.

    Abstract hook: this declaration carries no implementation, so concrete
    subclasses must override it.

    Args:
        df (pd.DataFrame): The DataFrame containing numeric columns.
    """

`transform_data(df)` `abstractmethod` ¶

Processes and transforms the data.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The DataFrame to transform.	required

Source code in periomod/data/_basedata.py

@abstractmethod
def transform_data(self, df: pd.DataFrame):
    """Processes and transforms the data.

    Abstract hook: this declaration carries no implementation, so concrete
    subclasses must override it.

    Args:
        df (pd.DataFrame): The DataFrame to transform.
    """

BaseDataLoader

__init__(task, encoding, encode, scale) ¶

encode_categorical_columns(df) abstractmethod ¶

load_data(path=Path('data/processed/processed_data.csv')) staticmethod ¶

save_data(df, path=Path('data/training/training_data.csv')) ¶

scale_numeric_columns(df) abstractmethod ¶

transform_data(df) abstractmethod ¶

`__init__(task, encoding, encode, scale)` ¶

`encode_categorical_columns(df)` `abstractmethod` ¶

`load_data(path=Path('data/processed/processed_data.csv'))` `staticmethod` ¶

`save_data(df, path=Path('data/training/training_data.csv'))` ¶

`scale_numeric_columns(df)` `abstractmethod` ¶

`transform_data(df)` `abstractmethod` ¶