ProcessDataHelper

Bases: BaseConfig

Helper class for processing periodontal data with utility methods.

This class provides methods for evaluating tooth infection status, calculating adjacent infected teeth, and imputing values for 'plaque' and 'furcationbaseline' columns based on predefined rules and conditions.

Inherits

BaseConfig: Provides configuration settings for data processing.

Attributes:

Name	Type	Description
`teeth_neighbors`	`dict`	Dictionary mapping each tooth to its adjacent neighbors.
`sides_with_fur`	`dict`	Dictionary specifying teeth with furcations and their respective sides.

Methods:

Name	Description
`check_infection`	Evaluates infection status based on pocket depth and BOP values.
`get_adjacent_infected_teeth_count`	Adds a column to indicate the count of adjacent infected teeth for each tooth.
`plaque_imputation`	Imputes values in the 'plaque' column.
`fur_imputation`	Imputes values in the 'furcationbaseline' column.

Example

helper = ProcessDataHelper()
data = helper.plaque_imputation(data)
data = helper.fur_imputation(data)
infected_count_data = helper.get_adjacent_infected_teeth_count(
    data, patient_col="id_patient", tooth_col="tooth",
    infection_col="infection"
)

Source code in periomod/data/_helpers.py

class ProcessDataHelper(BaseConfig):
    """Helper class for processing periodontal data with utility methods.

    This class provides methods for evaluating tooth infection status,
    calculating adjacent infected teeth, and imputing values for 'plaque' and
    'furcationbaseline' columns based on predefined rules and conditions.

    Inherits:
        - `BaseConfig`: Provides configuration settings for data processing.

    Attributes:
        teeth_neighbors (dict): Dictionary mapping each tooth to its adjacent
            neighbors.
        sides_with_fur (dict): Dictionary specifying teeth with furcations and
            their respective sides.

    Methods:
        check_infection: Evaluates infection status based on pocket depth and
            BOP values.
        get_adjacent_infected_teeth_count: Adds a column to indicate the count
            of adjacent infected teeth for each tooth.
        plaque_imputation: Imputes values in the 'plaque' column.
        fur_imputation: Imputes values in the 'furcationbaseline' column.

    Example:
        ```
        helper = ProcessDataHelper()
        data = helper.plaque_imputation(data)
        data = helper.fur_imputation(data)
        infected_count_data = helper.get_adjacent_infected_teeth_count(
            data, patient_col="id_patient", tooth_col="tooth",
            infection_col="infection"
        )
        ```
    """

    def __init__(self):
        """Initialize the helper's lookup tables without storing any DataFrame."""
        super().__init__()
        self.teeth_neighbors = _get_teeth_neighbors()
        self.sides_with_fur = _get_side()

    @staticmethod
    def check_infection(depth: int, boprevaluation: int) -> int:
        """Check if a given tooth side is infected.

        Args:
            depth: the depth of the pocket before the therapy
            boprevaluation: the value of BOP evaluation for the tooth side

        Returns:
            1 if the depth is greater than 4, or equal to 4 with a BOP value
            of 2; otherwise 0.
        """
        if depth > 4:
            return 1
        if depth == 4 and boprevaluation == 2:
            return 1
        return 0

    def _tooth_neighbor(self, nr: int) -> Union[np.ndarray, str]:
        """Returns adjacent teeth for a given tooth.

        Args:
            nr (int): Tooth number looked up in `teeth_neighbors`.

        Returns:
            Union[np.ndarray, str]: Array of adjacent teeth. For a tooth number
            missing from `teeth_neighbors` the fallback text 'No tooth' is
            wrapped by `np.array`, which yields a zero-dimensional string array
            that cannot be iterated directly.
        """
        return np.array(self.teeth_neighbors.get(nr, "No tooth"))

    def _count_infected_neighbors(self, tooth: int, infected_teeth: set) -> int:
        """Counts how many neighbors of a tooth are in the infected set.

        Args:
            tooth (int): Tooth number whose neighbors are inspected.
            infected_teeth (set): Tooth numbers flagged as infected.

        Returns:
            int: Number of neighbors contained in `infected_teeth`; 0 when the
            tooth has no entry in `teeth_neighbors`.
        """
        # atleast_1d turns the zero-dimensional 'No tooth' fallback into a
        # one-element array, so unknown teeth count 0 instead of raising
        # "iteration over a 0-d array".
        neighbors = np.atleast_1d(self._tooth_neighbor(nr=tooth))
        return sum(1 for neighbor in neighbors if neighbor in infected_teeth)

    def get_adjacent_infected_teeth_count(
        self, data: pd.DataFrame, patient_col: str, tooth_col: str, infection_col: str
    ) -> pd.DataFrame:
        """Adds a new column indicating the number of adjacent infected teeth.

        The column is written into `data` itself, which is also returned.

        Args:
            data (pd.DataFrame): Dataset to process.
            patient_col (str): Name of column containing ID for patients.
            tooth_col (str): Name of column containing teeth represented in numbers.
            infection_col (str): Name of column flagging infected teeth; rows
                with value 1 are treated as infected.

        Returns:
            pd.DataFrame: Modified dataset with new column 'infected_neighbors'.
        """
        for patient_id, patient_data in data.groupby(patient_col):
            infected_teeth = set(
                patient_data[patient_data[infection_col] == 1][tooth_col]
            )

            data.loc[data[patient_col] == patient_id, "infected_neighbors"] = (
                patient_data[tooth_col].apply(
                    lambda tooth, infected=infected_teeth: (
                        self._count_infected_neighbors(tooth, infected)
                    )
                )
            )

        return data

    @staticmethod
    def _plaque_values(row: pd.Series, modes_dict: dict) -> int:
        """Calculate new values for the Plaque column.

        Args:
            row (pd.Series): A row from the DataFrame.
            modes_dict (dict): Dict mapping (tooth, side, pdbaseline_grouped)
            to the mode plaque value.

        Returns:
            int: Imputed plaque value for the given row. Rows flagged with
            'plaque_all_na' get 2 when the looked-up mode is 2 (or a tuple
            containing 2) and 1 otherwise; other rows keep their value, with
            missing values replaced by 1.
        """
        if row["plaque_all_na"] == 1:
            key = (row["tooth"], row["side"], row["pdbaseline_grouped"])
            mode_value = modes_dict.get(key)
            if isinstance(mode_value, tuple):
                return 2 if 2 in mode_value else 1
            # A missing key, a mode of 1 and any other mode all fall back to 1.
            return 2 if mode_value == 2 else 1

        if pd.isna(row["plaque"]):
            return 1
        return row["plaque"]

    def plaque_imputation(self, data: pd.DataFrame) -> pd.DataFrame:
        """Imputes values for Plaque without affecting other columns.

        The input DataFrame is left untouched; all work happens on a copy.

        Args:
            data (pd.DataFrame): Input DataFrame with a 'plaque' column.

        Returns:
            pd.DataFrame: The DataFrame with the imputed 'plaque' values and
            lower-cased column names.

        Raises:
            KeyError: If plaque column is not found in DataFrame.
        """
        # Copy first: otherwise the caller's frame gets its columns renamed and
        # keeps the temporary helper columns, because `drop` below only removes
        # them from the returned frame.
        data = data.copy()
        data.columns = [col.lower() for col in data.columns]

        if "plaque" not in data.columns:
            raise KeyError("'plaque' column not found in the DataFrame")
        data["plaque"] = pd.to_numeric(data["plaque"], errors="coerce")

        # Bucket the baseline pocket depth: <=3 -> 0, 4-5 -> 1, >=6 -> 2;
        # anything matching no condition (e.g. missing values) becomes -1.
        conditions_baseline = [
            data["pdbaseline"] <= 3,
            (data["pdbaseline"] >= 4) & (data["pdbaseline"] <= 5),
            data["pdbaseline"] >= 6,
        ]
        choices_baseline = [0, 1, 2]
        data["pdbaseline_grouped"] = np.select(
            conditions_baseline, choices_baseline, default=-1
        )

        # Flag every row of a group (keyed by `self.group_col`) whose plaque
        # values are all missing.
        patients_with_all_nas = data.groupby(self.group_col)["plaque"].apply(
            lambda values: values.isna().all()
        )
        data["plaque_all_na"] = data[self.group_col].isin(
            patients_with_all_nas[patients_with_all_nas].index
        )

        # First mode of plaque per (tooth, side, depth bucket); None when the
        # group holds no observed value.
        modes_dict = {}
        for key, group in data.groupby(["tooth", "side", "pdbaseline_grouped"]):
            modes = group["plaque"].mode()
            modes_dict[key] = modes.iloc[0] if not modes.empty else None

        data["plaque"] = data.apply(
            lambda row: self._plaque_values(row=row, modes_dict=modes_dict), axis=1
        )

        return data.drop(["pdbaseline_grouped", "plaque_all_na"], axis=1)

    def _fur_side(self, nr: int) -> Union[np.ndarray, str]:
        """Returns the sides for the input tooth that should have furcations.

        Args:
            nr (int): Tooth number.

        Returns:
            Union[np.ndarray, str]: Sides with furcations, or the text
            'Tooth without Furkation' if no key of `sides_with_fur` contains
            the tooth.
        """
        for key, value in self.sides_with_fur.items():
            if nr in key:
                return np.array(value)
        return "Tooth without Furkation"

    def _fur_values(self, row: pd.Series) -> int:
        """Calculate values for the FurcationBaseline column.

        Args:
            row (pd.Series): A row from the DataFrame.

        Returns:
            int: Imputed value for furcationbaseline.

        Raises:
            ValueError: If NaN is found in pd- or recbaseline.
        """
        tooth_fur = (14, 16, 17, 18, 24, 26, 27, 28, 36, 37, 38, 46, 47, 48)
        if pd.isna(row["pdbaseline"]) or pd.isna(row["recbaseline"]):
            raise ValueError(
                "NaN found in pdbaseline or recbaseline. Check RecBaseline imputation."
            )

        # Rows of groups that have some observed furcation value keep their
        # own value; only isolated gaps are filled with 0.
        if row["furcationbaseline_all_na"] != 1:
            if pd.isna(row["furcationbaseline"]):
                return 0
            return row["furcationbaseline"]

        if row["tooth"] not in tooth_fur:
            return 0

        fur_sides = self._fur_side(nr=row["tooth"])
        # `_fur_side` answers with a plain string when the tooth has no entry in
        # `sides_with_fur`; testing `side in <str>` would raise a TypeError for
        # numeric sides, so treat that answer as "no furcation side".
        if isinstance(fur_sides, str) or row["side"] not in fur_sides:
            return 0

        depth_plus_recession = row["pdbaseline"] + row["recbaseline"]
        if depth_plus_recession < 4:
            return 0
        if depth_plus_recession < 6:
            return 1
        return 2

    def fur_imputation(self, data: pd.DataFrame) -> pd.DataFrame:
        """Impute the values in the FurcationBaseline column.

        The input DataFrame is left untouched; all work happens on a copy.

        Args:
            data (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

        Raises:
            KeyError: If furcationbaseline is not found in DataFrame.
        """
        if "furcationbaseline" not in data.columns:
            raise KeyError("'furcationbaseline' column not found in the DataFrame")

        # Copy first: otherwise the caller's frame is overwritten and keeps the
        # temporary flag column, because `drop` below only removes it from the
        # returned frame.
        data = data.copy()

        # Flag every row of a group (keyed by `self.group_col`) whose furcation
        # values are all missing.
        patients_with_all_nas = data.groupby(self.group_col)["furcationbaseline"].apply(
            lambda values: values.isna().all()
        )
        data["furcationbaseline_all_na"] = data[self.group_col].isin(
            patients_with_all_nas[patients_with_all_nas].index
        )

        data["furcationbaseline"] = data.apply(self._fur_values, axis=1)

        return data.drop(["furcationbaseline_all_na"], axis=1)

`__init__()` ¶

Initialize ProcessDataHelper with helper data without storing the DataFrame.

Source code in periomod/data/_helpers.py

def __init__(self):
    """Initialize the helper's lookup tables without storing any DataFrame.

    Runs the parent initializer, then stores the results of
    `_get_teeth_neighbors()` and `_get_side()` on the instance as
    `teeth_neighbors` and `sides_with_fur`.
    """
    super().__init__()
    self.teeth_neighbors = _get_teeth_neighbors()
    self.sides_with_fur = _get_side()

`check_infection(depth, boprevaluation)` `staticmethod` ¶

Check if a given tooth side is infected.

Parameters:

Name	Type	Description	Default
`depth`	`int`	the depth of the pocket before the therapy	required
`boprevaluation`	`int`	the value of BOP evaluation for the tooth side	required

Returns:

Type	Description
`int`	1 if the tooth side is infected, otherwise 0.

Source code in periomod/data/_helpers.py

@staticmethod
def check_infection(depth: int, boprevaluation: int) -> int:
    """Check if a given tooth side is infected.

    Args:
        depth: the depth of the pocket before the therapy
        boprevaluation: the value of BOP evaluation for the tooth side

    Returns:
        1 if the tooth side is infected, otherwise 0.
    """
    if depth > 4:
        return 1
    elif depth == 4 and boprevaluation == 2:
        return 1
    return 0

`fur_imputation(data)` ¶

Impute the values in the FurcationBaseline column.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Input DataFrame.	required

Returns:

Type	Description
`DataFrame`	pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

Raises:

Type	Description
`KeyError`	If furcationbaseline is not found in DataFrame.

Source code in periomod/data/_helpers.py

def fur_imputation(self, data: pd.DataFrame) -> pd.DataFrame:
    """Impute the values in the FurcationBaseline column.

    The input DataFrame is left untouched; all work happens on a copy.

    Args:
        data (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

    Raises:
        KeyError: If furcationbaseline is not found in DataFrame.
    """
    if "furcationbaseline" not in data.columns:
        raise KeyError("'furcationbaseline' column not found in the DataFrame")

    # Copy first: otherwise the caller's frame is overwritten and keeps the
    # temporary flag column, because `drop` below only removes it from the
    # returned frame.
    data = data.copy()

    # Flag every row of a group (keyed by `self.group_col`) whose furcation
    # values are all missing.
    patients_with_all_nas = data.groupby(self.group_col)["furcationbaseline"].apply(
        lambda values: values.isna().all()
    )
    data["furcationbaseline_all_na"] = data[self.group_col].isin(
        patients_with_all_nas[patients_with_all_nas].index
    )

    data["furcationbaseline"] = data.apply(self._fur_values, axis=1)

    return data.drop(["furcationbaseline_all_na"], axis=1)

`get_adjacent_infected_teeth_count(data, patient_col, tooth_col, infection_col)` ¶

Adds a new column indicating the number of adjacent infected teeth.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Dataset to process.	required
`patient_col`	`str`	Name of column containing ID for patients.	required
`tooth_col`	`str`	Name of column containing teeth represented in numbers.	required
`infection_col`	`str`	Name of column indicating whether a tooth is infected (value 1 = infected).	required

Returns:

Type	Description
`DataFrame`	pd.DataFrame: Modified dataset with new column 'infected_neighbors'.

Source code in periomod/data/_helpers.py

def get_adjacent_infected_teeth_count(
    self, data: pd.DataFrame, patient_col: str, tooth_col: str, infection_col: str
) -> pd.DataFrame:
    """Adds a new column indicating the number of adjacent infected teeth.

    The column is written into `data` itself, which is also returned.

    Args:
        data (pd.DataFrame): Dataset to process.
        patient_col (str): Name of column containing ID for patients.
        tooth_col (str): Name of column containing teeth represented in numbers.
        infection_col (str): Name of column flagging infected teeth; rows with
            value 1 are treated as infected.

    Returns:
        pd.DataFrame: Modified dataset with new column 'infected_neighbors'.
    """

    def count_infected_neighbors(tooth, infected_teeth):
        # `_tooth_neighbor` can hand back a zero-dimensional 'No tooth' array
        # for unknown tooth numbers; atleast_1d makes it iterable so such
        # teeth count 0 instead of raising "iteration over a 0-d array".
        neighbors = np.atleast_1d(self._tooth_neighbor(nr=tooth))
        return sum(1 for neighbor in neighbors if neighbor in infected_teeth)

    for patient_id, patient_data in data.groupby(patient_col):
        infected_teeth = set(
            patient_data[patient_data[infection_col] == 1][tooth_col]
        )

        data.loc[data[patient_col] == patient_id, "infected_neighbors"] = (
            patient_data[tooth_col].apply(
                lambda tooth, infected=infected_teeth: count_infected_neighbors(
                    tooth, infected
                )
            )
        )

    return data

`plaque_imputation(data)` ¶

Imputes values for Plaque without affecting other columns.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Input DataFrame with a 'plaque' column.	required

Returns:

Type	Description
`DataFrame`	pd.DataFrame: The DataFrame with the imputed 'plaque' values.

Raises:

Type	Description
`KeyError`	If plaque column is not found in DataFrame.

Source code in periomod/data/_helpers.py

def plaque_imputation(self, data: pd.DataFrame) -> pd.DataFrame:
    """Imputes values for Plaque without affecting other columns.

    The input DataFrame is left untouched; all work happens on a copy.

    Args:
        data (pd.DataFrame): Input DataFrame with a 'plaque' column.

    Returns:
        pd.DataFrame: The DataFrame with the imputed 'plaque' values and
        lower-cased column names.

    Raises:
        KeyError: If plaque column is not found in DataFrame.
    """
    # Copy first: otherwise the caller's frame gets its columns renamed and
    # keeps the temporary helper columns, because `drop` below only removes
    # them from the returned frame.
    data = data.copy()
    data.columns = [col.lower() for col in data.columns]

    if "plaque" not in data.columns:
        raise KeyError("'plaque' column not found in the DataFrame")
    data["plaque"] = pd.to_numeric(data["plaque"], errors="coerce")

    # Bucket the baseline pocket depth: <=3 -> 0, 4-5 -> 1, >=6 -> 2;
    # anything matching no condition (e.g. missing values) becomes -1.
    conditions_baseline = [
        data["pdbaseline"] <= 3,
        (data["pdbaseline"] >= 4) & (data["pdbaseline"] <= 5),
        data["pdbaseline"] >= 6,
    ]
    choices_baseline = [0, 1, 2]
    data["pdbaseline_grouped"] = np.select(
        conditions_baseline, choices_baseline, default=-1
    )

    # Flag every row of a group (keyed by `self.group_col`) whose plaque
    # values are all missing.
    patients_with_all_nas = data.groupby(self.group_col)["plaque"].apply(
        lambda values: values.isna().all()
    )
    data["plaque_all_na"] = data[self.group_col].isin(
        patients_with_all_nas[patients_with_all_nas].index
    )

    # First mode of plaque per (tooth, side, depth bucket); None when the
    # group holds no observed value.
    modes_dict = {}
    for key, group in data.groupby(["tooth", "side", "pdbaseline_grouped"]):
        modes = group["plaque"].mode()
        modes_dict[key] = modes.iloc[0] if not modes.empty else None

    data["plaque"] = data.apply(
        lambda row: self._plaque_values(row=row, modes_dict=modes_dict), axis=1
    )

    return data.drop(["pdbaseline_grouped", "plaque_all_na"], axis=1)

ProcessDataHelper

__init__() ¶

check_infection(depth, boprevaluation) staticmethod ¶

fur_imputation(data) ¶

get_adjacent_infected_teeth_count(data, patient_col, tooth_col, infection_col) ¶

plaque_imputation(data) ¶

`__init__()` ¶

`check_infection(depth, boprevaluation)` `staticmethod` ¶

`fur_imputation(data)` ¶

`get_adjacent_infected_teeth_count(data, patient_col, tooth_col, infection_col)` ¶

`plaque_imputation(data)` ¶