Skip to content

ProcessDataHelper

Bases: BaseConfig

Helper class for processing periodontal data with utility methods.

This class provides methods for evaluating tooth infection status, calculating adjacent infected teeth, and imputing values for 'plaque' and 'furcationbaseline' columns based on predefined rules and conditions.

Inherits
  • BaseConfig: Provides configuration settings for data processing.

Attributes:

Name Type Description
teeth_neighbors dict

Dictionary mapping each tooth to its adjacent neighbors.

sides_with_fur dict

Dictionary specifying teeth with furcations and their respective sides.

Methods:

Name Description
check_infection

Evaluates infection status based on pocket depth and BOP values.

get_adjacent_infected_teeth_count

Adds a column to indicate the count of adjacent infected teeth for each tooth.

plaque_imputation

Imputes values in the 'plaque' column.

fur_imputation

Imputes values in the 'furcationbaseline' column.

Example
helper = ProcessDataHelper()
df = helper.plaque_imputation(df)
df = helper.fur_imputation(df)
infected_count_df = helper.get_adjacent_infected_teeth_count(
    df, patient_col="id_patient", tooth_col="tooth",
    infection_col="infection"
)
Source code in periomod/data/_helpers.py
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
class ProcessDataHelper(BaseConfig):
    """Helper class for processing periodontal data with utility methods.

    This class provides methods for evaluating tooth infection status,
    calculating adjacent infected teeth, and imputing values for 'plaque' and
    'furcationbaseline' columns based on predefined rules and conditions.

    Inherits:
        - `BaseConfig`: Provides configuration settings for data processing.

    Attributes:
        teeth_neighbors (dict): Dictionary mapping each tooth to its adjacent
            neighbors.
        sides_with_fur (dict): Dictionary specifying teeth with furcations and
            their respective sides.

    Methods:
        check_infection: Evaluates infection status based on pocket depth and
            BOP values.
        get_adjacent_infected_teeth_count: Adds a column to indicate the count
            of adjacent infected teeth for each tooth.
        plaque_imputation: Imputes values in the 'plaque' column.
        fur_imputation: Imputes values in the 'furcationbaseline' column.

    Example:
        ```
        helper = ProcessDataHelper()
        df = helper.plaque_imputation(df)
        df = helper.fur_imputation(df)
        infected_count_df = helper.get_adjacent_infected_teeth_count(
            df, patient_col="id_patient", tooth_col="tooth",
            infection_col="infection"
        )
        ```
    """

    # Teeth for which `_fur_values` applies the depth-based furcation rule.
    _FURCATION_TEETH = (14, 16, 17, 18, 24, 26, 27, 28, 36, 37, 38, 46, 47, 48)

    def __init__(self):
        """Initialize the helper's lookup tables.

        Loads the tooth-adjacency mapping and the furcation-side mapping. No
        DataFrame is stored on the instance.
        """
        super().__init__()
        self.teeth_neighbors = _get_teeth_neighbors()
        self.sides_with_fur = _get_side()

    @staticmethod
    def check_infection(depth: int, boprevaluation: int) -> int:
        """Check if a given tooth side is infected.

        A side counts as infected when the pocket depth exceeds 4, or when the
        depth is exactly 4 and the BOP evaluation equals 2.

        Args:
            depth: the depth of the pocket before the therapy
            boprevaluation: the value of BOP evaluation for the tooth side

        Returns:
            1 if the tooth side is infected, otherwise 0.
        """
        deep_pocket = depth > 4
        borderline_with_bop = depth == 4 and boprevaluation == 2
        return int(deep_pocket or borderline_with_bop)

    def _tooth_neighbor(self, nr: int) -> np.ndarray:
        """Returns adjacent teeth for a given tooth.

        Args:
            nr (int): tooth number (11-48)

        Returns:
            np.ndarray: Array built from the `teeth_neighbors` entry for `nr`.
            For a tooth that is not in the mapping the result is a
            zero-dimensional array wrapping the string 'No tooth'; such an
            array cannot be iterated, so callers must check `ndim` first.
        """
        return np.array(self.teeth_neighbors.get(nr, "No tooth"))

    def _count_infected_neighbors(self, tooth, infected_teeth: set) -> int:
        """Count how many neighbors of `tooth` appear in `infected_teeth`.

        Args:
            tooth: Tooth number to look up.
            infected_teeth (set): Tooth numbers flagged as infected.

        Returns:
            int: Number of infected neighbors; 0 for a tooth that is not in
            the neighbor mapping.
        """
        neighbors = self._tooth_neighbor(nr=tooth)
        if neighbors.ndim == 0:
            # Unknown tooth: the sentinel is a 0-d array and iterating it
            # would raise TypeError, so treat it as having no neighbors.
            return 0
        return sum(1 for neighbor in neighbors if neighbor in infected_teeth)

    def get_adjacent_infected_teeth_count(
        self, df: pd.DataFrame, patient_col: str, tooth_col: str, infection_col: str
    ) -> pd.DataFrame:
        """Adds a new column indicating the number of adjacent infected teeth.

        The column is written onto `df` itself, which is also returned.

        Args:
            df (pd.DataFrame): Dataset to process.
            patient_col (str): Name of column containing ID for patients.
            tooth_col (str): Name of column containing teeth represented in numbers.
            infection_col (str): Name of column flagging infection; rows where
                it equals 1 are treated as infected teeth.

        Returns:
            pd.DataFrame: Modified dataset with new column 'infected_neighbors'.
        """
        for patient_id, patient_data in df.groupby(patient_col):
            is_infected = patient_data[infection_col] == 1
            infected_teeth = set(patient_data.loc[is_infected, tooth_col])

            df.loc[df[patient_col] == patient_id, "infected_neighbors"] = (
                patient_data[tooth_col].apply(
                    self._count_infected_neighbors, args=(infected_teeth,)
                )
            )

        return df

    @staticmethod
    def _plaque_values(row: pd.Series, modes_dict: dict) -> int:
        """Calculate new values for the Plaque column.

        Rows flagged `plaque_all_na` take the mode stored for their
        (tooth, side, pdbaseline_grouped) key: 2 when that mode is 2 (or is a
        tuple containing 2), otherwise 1. All other rows keep their own value,
        with missing values replaced by 1.

        Args:
            row (pd.Series): A row from the DataFrame.
            modes_dict (dict): Dict mapping (tooth, side, pdbaseline_grouped)
            to the mode plaque value.

        Returns:
            int: Imputed plaque value for the given row.
        """
        if row["plaque_all_na"] == 1:
            key = (row["tooth"], row["side"], row["pdbaseline_grouped"])
            mode_value = modes_dict.get(key)
            if isinstance(mode_value, tuple):
                return 2 if 2 in mode_value else 1
            # A missing key, a None mode, or any mode other than 2 maps to 1.
            return 2 if mode_value == 2 else 1

        if pd.isna(row["plaque"]):
            return 1
        return row["plaque"]

    def plaque_imputation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Imputes missing values in the 'plaque' column.

        The input frame is left untouched; the returned frame has lower-cased
        column names and no temporary helper columns.

        Args:
            df (pd.DataFrame): Input DataFrame with a 'plaque' column (matched
                case-insensitively) plus the 'pdbaseline', 'tooth', 'side' and
                `self.group_col` columns.

        Returns:
            pd.DataFrame: The DataFrame with the imputed 'plaque' values.

        Raises:
            KeyError: If no 'plaque' column exists after lower-casing.
        """
        # Copy first so that neither the renamed columns nor the temporary
        # helper columns leak into the caller's frame.
        df = df.copy()
        df.columns = [col.lower() for col in df.columns]

        if "plaque" not in df.columns:
            raise KeyError("'plaque' column not found in the DataFrame")
        df["plaque"] = pd.to_numeric(df["plaque"], errors="coerce")

        # Bucket baseline pocket depth: <=3 -> 0, 4-5 -> 1, >=6 -> 2; anything
        # that matches no bucket (e.g. NaN) -> -1.
        depth = df["pdbaseline"]
        df["pdbaseline_grouped"] = np.select(
            [depth <= 3, (depth >= 4) & (depth <= 5), depth >= 6],
            [0, 1, 2],
            default=-1,
        )

        # Flag every row belonging to a group whose plaque values are all NaN.
        all_na_by_group = df.groupby(self.group_col)["plaque"].apply(
            lambda values: values.isna().all()
        )
        df["plaque_all_na"] = df[self.group_col].isin(
            all_na_by_group[all_na_by_group].index
        )

        # Most frequent plaque value per (tooth, side, depth bucket); None when
        # the group holds no observed value.
        modes_dict = {}
        for key, group in df.groupby(["tooth", "side", "pdbaseline_grouped"]):
            modes = group["plaque"].mode()
            modes_dict[key] = None if modes.empty else modes.iloc[0]

        df["plaque"] = df.apply(
            lambda row: self._plaque_values(row=row, modes_dict=modes_dict), axis=1
        )

        return df.drop(["pdbaseline_grouped", "plaque_all_na"], axis=1)

    def _fur_side(self, nr: int) -> Union[np.ndarray, str]:
        """Returns the sides for the input tooth that should have furcations.

        Args:
            nr (int): Tooth number.

        Returns:
            Union[np.ndarray, str]: Array of sides from the first
            `sides_with_fur` entry whose key contains `nr`, or the string
            'Tooth without Furkation' when no key contains it.
        """
        for teeth, sides in self.sides_with_fur.items():
            if nr in teeth:
                return np.array(sides)
        return "Tooth without Furkation"

    def _fur_values(self, row: pd.Series) -> int:
        """Calculate values for the FurcationBaseline column.

        Rows flagged `furcationbaseline_all_na` are imputed from
        pdbaseline + recbaseline (<4 -> 0, 4 up to <6 -> 1, otherwise 2) when
        the tooth/side is a furcation site, and 0 otherwise. All other rows
        keep their own value, with missing values replaced by 0.

        Args:
            row (pd.Series): A row from the DataFrame.

        Returns:
            int: Imputed value for furcationbaseline.

        Raises:
            ValueError: If 'pdbaseline' or 'recbaseline' is missing in the row.
        """
        if pd.isna(row["pdbaseline"]) or pd.isna(row["recbaseline"]):
            raise ValueError(
                "NaN found in pdbaseline or recbaseline. Check RecBaseline imputation."
            )

        if row["furcationbaseline_all_na"] == 1:
            if row["tooth"] not in self._FURCATION_TEETH:
                return 0

            sides = self._fur_side(nr=row["tooth"])
            # `_fur_side` returns a plain string when the tooth has no entry;
            # testing membership in that string would raise or substring-match.
            if isinstance(sides, str) or row["side"] not in sides:
                return 0

            depth_plus_recession = row["pdbaseline"] + row["recbaseline"]
            if depth_plus_recession < 4:
                return 0
            if depth_plus_recession < 6:
                return 1
            return 2

        if pd.isna(row["furcationbaseline"]):
            return 0
        return row["furcationbaseline"]

    def fur_imputation(self, df: pd.DataFrame) -> pd.DataFrame:
        """Impute the values in the FurcationBaseline column.

        The input frame is left untouched; a new frame is returned.

        Args:
            df (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

        Raises:
            KeyError: If the 'furcationbaseline' column is missing.
            ValueError: Propagated from `_fur_values` when 'pdbaseline' or
                'recbaseline' contains NaN.
        """
        if "furcationbaseline" not in df.columns:
            raise KeyError("'furcationbaseline' column not found in the DataFrame")

        # Copy so the temporary flag column never appears on the caller's frame.
        df = df.copy()

        # Flag every row belonging to a group whose furcation values are all NaN.
        all_na_by_group = df.groupby(self.group_col)["furcationbaseline"].apply(
            lambda values: values.isna().all()
        )
        df["furcationbaseline_all_na"] = df[self.group_col].isin(
            all_na_by_group[all_na_by_group].index
        )

        df["furcationbaseline"] = df.apply(self._fur_values, axis=1)

        return df.drop(["furcationbaseline_all_na"], axis=1)

__init__()

Initialize the helper's tooth-neighbor and furcation-side lookup tables; no DataFrame is stored.

Source code in periomod/data/_helpers.py
100
101
102
103
104
def __init__(self):
    """Set up the helper's lookup tables; no DataFrame is kept on the instance."""
    super().__init__()

    # Tooth-adjacency mapping.
    neighbor_table = _get_teeth_neighbors()
    self.teeth_neighbors = neighbor_table

    # Furcation-side mapping.
    furcation_sides = _get_side()
    self.sides_with_fur = furcation_sides

check_infection(depth, boprevaluation) staticmethod

Check if a given tooth side is infected.

Parameters:

Name Type Description Default
depth int

the depth of the pocket before the therapy

required
boprevaluation int

the value of BOP evaluation for the tooth side

required

Returns:

Type Description
int

1 if the tooth side is infected, otherwise 0.

Source code in periomod/data/_helpers.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
@staticmethod
def check_infection(depth: int, boprevaluation: int) -> int:
    """Check if a given tooth side is infected.

    Args:
        depth: the depth of the pocket before the therapy
        boprevaluation: the value of BOP evaluation for the tooth side

    Returns:
        1 if the tooth side is infected, otherwise 0.
    """
    if depth > 4:
        return 1
    elif depth == 4 and boprevaluation == 2:
        return 1
    return 0

fur_imputation(df)

Impute the values in the FurcationBaseline column.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

Source code in periomod/data/_helpers.py
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
def fur_imputation(self, df: pd.DataFrame) -> pd.DataFrame:
    """Impute the values in the FurcationBaseline column.

    Rows are flagged when every 'furcationbaseline' value in their
    `self.group_col` group is missing; the per-row rule is then delegated to
    `self._fur_values`. The input frame is left untouched.

    Args:
        df (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

    Raises:
        KeyError: If the 'furcationbaseline' column is missing.
    """
    if "furcationbaseline" not in df.columns:
        raise KeyError("'furcationbaseline' column not found in the DataFrame")

    # Copy so the temporary flag column never appears on the caller's frame.
    df = df.copy()

    # Flag every row belonging to a group whose furcation values are all NaN.
    all_na_by_group = df.groupby(self.group_col)["furcationbaseline"].apply(
        lambda values: values.isna().all()
    )
    df["furcationbaseline_all_na"] = df[self.group_col].isin(
        all_na_by_group[all_na_by_group].index
    )

    df["furcationbaseline"] = df.apply(self._fur_values, axis=1)

    return df.drop(["furcationbaseline_all_na"], axis=1)

get_adjacent_infected_teeth_count(df, patient_col, tooth_col, infection_col)

Adds a new column indicating the number of adjacent infected teeth.

Parameters:

Name Type Description Default
df DataFrame

Dataset to process.

required
patient_col str

Name of column containing ID for patients.

required
tooth_col str

Name of column containing teeth represented in numbers.

required
infection_col str

Name of column flagging infection; rows where it equals 1 are treated as infected teeth.

required

Returns:

Type Description
DataFrame

pd.DataFrame: Modified dataset with new column 'infected_neighbors'.

Source code in periomod/data/_helpers.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def get_adjacent_infected_teeth_count(
    self, df: pd.DataFrame, patient_col: str, tooth_col: str, infection_col: str
) -> pd.DataFrame:
    """Adds a new column indicating the number of adjacent infected teeth.

    The column is written onto `df` itself, which is also returned. A tooth
    for which `self._tooth_neighbor` yields a zero-dimensional (non-iterable)
    result is counted as having no infected neighbors.

    Args:
        df (pd.DataFrame): Dataset to process.
        patient_col (str): Name of column containing ID for patients.
        tooth_col (str): Name of column containing teeth represented in numbers.
        infection_col (str): Name of column flagging infection; rows where it
            equals 1 are treated as infected teeth.

    Returns:
        pd.DataFrame: Modified dataset with new column 'infected_neighbors'.
    """

    def count_infected_neighbors(tooth, infected_teeth):
        neighbors = self._tooth_neighbor(nr=tooth)
        if np.ndim(neighbors) == 0:
            # Scalar sentinel for an unknown tooth: iterating it would raise
            # TypeError, so treat it as having no neighbors.
            return 0
        return sum(1 for neighbor in neighbors if neighbor in infected_teeth)

    for patient_id, patient_data in df.groupby(patient_col):
        is_infected = patient_data[infection_col] == 1
        infected_teeth = set(patient_data.loc[is_infected, tooth_col])

        df.loc[df[patient_col] == patient_id, "infected_neighbors"] = patient_data[
            tooth_col
        ].apply(count_infected_neighbors, args=(infected_teeth,))

    return df

plaque_imputation(df)

Imputes missing values in the 'plaque' column; all column names are lower-cased in the process.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame with a 'plaque' column.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The DataFrame with the imputed 'plaque' values.

Source code in periomod/data/_helpers.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
def plaque_imputation(self, df: pd.DataFrame) -> pd.DataFrame:
    """Imputes missing values in the 'plaque' column.

    The per-row rule is delegated to `self._plaque_values`. The input frame is
    left untouched; the returned frame has lower-cased column names and no
    temporary helper columns.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'plaque' column (matched
            case-insensitively) plus the 'pdbaseline', 'tooth', 'side' and
            `self.group_col` columns.

    Returns:
        pd.DataFrame: The DataFrame with the imputed 'plaque' values.

    Raises:
        KeyError: If no 'plaque' column exists after lower-casing.
    """
    # Copy first so that neither the renamed columns nor the temporary helper
    # columns leak into the caller's frame.
    df = df.copy()
    df.columns = [col.lower() for col in df.columns]

    if "plaque" not in df.columns:
        raise KeyError("'plaque' column not found in the DataFrame")
    df["plaque"] = pd.to_numeric(df["plaque"], errors="coerce")

    # Bucket baseline pocket depth: <=3 -> 0, 4-5 -> 1, >=6 -> 2; anything
    # that matches no bucket (e.g. NaN) -> -1.
    depth = df["pdbaseline"]
    df["pdbaseline_grouped"] = np.select(
        [depth <= 3, (depth >= 4) & (depth <= 5), depth >= 6],
        [0, 1, 2],
        default=-1,
    )

    # Flag every row belonging to a group whose plaque values are all NaN.
    all_na_by_group = df.groupby(self.group_col)["plaque"].apply(
        lambda values: values.isna().all()
    )
    df["plaque_all_na"] = df[self.group_col].isin(
        all_na_by_group[all_na_by_group].index
    )

    # Most frequent plaque value per (tooth, side, depth bucket); None when the
    # group holds no observed value.
    modes_dict = {}
    for key, group in df.groupby(["tooth", "side", "pdbaseline_grouped"]):
        modes = group["plaque"].mode()
        modes_dict[key] = None if modes.empty else modes.iloc[0]

    df["plaque"] = df.apply(
        lambda row: self._plaque_values(row=row, modes_dict=modes_dict), axis=1
    )

    return df.drop(["pdbaseline_grouped", "plaque_all_na"], axis=1)