Skip to content

ProcessDataHelper

Bases: BaseConfig

Helper class for processing periodontal data with utility methods.

This class provides methods for evaluating tooth infection status, calculating adjacent infected teeth, and imputing values for 'plaque' and 'furcationbaseline' columns based on predefined rules and conditions.

Inherits
  • BaseConfig: Provides configuration settings for data processing.

Attributes:

Name Type Description
teeth_neighbors dict

Dictionary mapping each tooth to its adjacent neighbors.

sides_with_fur dict

Dictionary specifying teeth with furcations and their respective sides.

Methods:

Name Description
check_infection

Evaluates infection status based on pocket depth and BOP values.

get_adjacent_infected_teeth_count

Adds a column to indicate the count of adjacent infected teeth for each tooth.

plaque_imputation

Imputes values in the 'plaque' column.

fur_imputation

Imputes values in the 'furcationbaseline' column.

Example
helper = ProcessDataHelper()
data = helper.plaque_imputation(data)
data = helper.fur_imputation(data)
infected_count_data = helper.get_adjacent_infected_teeth_count(
    data, patient_col="id_patient", tooth_col="tooth",
    infection_col="infection"
)
Source code in periomod/data/_helpers.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
class ProcessDataHelper(BaseConfig):
    """Helper class for processing periodontal data with utility methods.

    This class provides methods for evaluating tooth infection status,
    calculating adjacent infected teeth, and imputing values for 'plaque' and
    'furcationbaseline' columns based on predefined rules and conditions.

    Inherits:
        - `BaseConfig`: Provides configuration settings for data processing.

    Attributes:
        teeth_neighbors (dict): Dictionary mapping each tooth to its adjacent
            neighbors.
        sides_with_fur (dict): Dictionary specifying teeth with furcations and
            their respective sides.

    Methods:
        check_infection: Evaluates infection status based on pocket depth and
            BOP values.
        get_adjacent_infected_teeth_count: Adds a column to indicate the count
            of adjacent infected teeth for each tooth.
        plaque_imputation: Imputes values in the 'plaque' column.
        fur_imputation: Imputes values in the 'furcationbaseline' column.

    Example:
        ```
        helper = ProcessDataHelper()
        data = helper.plaque_imputation(data)
        data = helper.fur_imputation(data)
        infected_count_data = helper.get_adjacent_infected_teeth_count(
            data, patient_col="id_patient", tooth_col="tooth",
            infection_col="infection"
        )
        ```
    """

    def __init__(self):
        """Initialize the helper's lookup tables without storing a DataFrame."""
        super().__init__()
        self.teeth_neighbors = _get_teeth_neighbors()
        self.sides_with_fur = _get_side()

    @staticmethod
    def check_infection(depth: int, boprevaluation: int) -> int:
        """Check if a given tooth side is infected.

        A side counts as infected when the pocket depth exceeds 4, or when the
        depth is exactly 4 and the BOP value equals 2.

        Args:
            depth: the depth of the pocket before the therapy
            boprevaluation: the value of BOP evaluation for the tooth side

        Returns:
            1 if the tooth side is infected, otherwise 0.
        """
        if depth > 4:
            return 1
        elif depth == 4 and boprevaluation == 2:
            return 1
        return 0

    def _tooth_neighbor(self, nr: int) -> Union[np.ndarray, str]:
        """Returns adjacent teeth for a given tooth.

        Args:
            nr (int): tooth number (11-48)

        Returns:
            Union[np.ndarray, str]: Array of adjacent teeth. For a tooth that is
            not a key of `teeth_neighbors` the result is a zero-dimensional
            array wrapping the string 'No tooth'; such an array cannot be
            iterated, so callers must check its dimensionality first.
        """
        return np.array(self.teeth_neighbors.get(nr, "No tooth"))

    def get_adjacent_infected_teeth_count(
        self, data: pd.DataFrame, patient_col: str, tooth_col: str, infection_col: str
    ) -> pd.DataFrame:
        """Adds a new column indicating the number of adjacent infected teeth.

        The input frame is modified in place and also returned. Teeth without
        an entry in `teeth_neighbors` get a count of 0.

        Args:
            data (pd.DataFrame): Dataset to process.
            patient_col (str): Name of column containing ID for patients.
            tooth_col (str): Name of column containing teeth represented in numbers.
            infection_col (str): Name of column flagging infected teeth (a value
                of 1 marks the tooth as infected).

        Returns:
            pd.DataFrame: Modified dataset with new column 'infected_neighbors'.
        """
        for patient_id, patient_data in data.groupby(patient_col):
            infected_teeth = set(
                patient_data[patient_data[infection_col] == 1][tooth_col]
            )

            # Bind the current patient's set as a default argument so the
            # helper never sees a later iteration's value.
            def count_infected(tooth, infected_teeth=infected_teeth) -> int:
                neighbors = self._tooth_neighbor(nr=tooth)
                # Unknown teeth come back as a 0-d 'No tooth' array; iterating
                # it would raise TypeError, so treat it as "no neighbors".
                if np.ndim(neighbors) == 0:
                    return 0
                return sum(1 for neighbor in neighbors if neighbor in infected_teeth)

            data.loc[data[patient_col] == patient_id, "infected_neighbors"] = (
                patient_data[tooth_col].apply(count_infected)
            )

        # With no patient groups (e.g. an empty frame) the loop never creates
        # the column; add it so the documented output schema always holds.
        if "infected_neighbors" not in data.columns:
            data["infected_neighbors"] = np.nan

        return data

    @staticmethod
    def _plaque_values(row: pd.Series, modes_dict: dict) -> int:
        """Calculate new values for the Plaque column.

        Args:
            row (pd.Series): A row from the DataFrame. Must provide 'plaque',
                'plaque_all_na', 'tooth', 'side' and 'pdbaseline_grouped'.
            modes_dict (dict): Dict mapping (tooth, side, pdbaseline_grouped)
                to the mode plaque value.

        Returns:
            int: Imputed plaque value for the given row. Rows of patients with
            no plaque data at all get 2 when the matching mode is 2 (or is a
            tuple containing 2) and 1 otherwise. For all other rows a missing
            value becomes 1 and an existing value is kept.
        """
        if row["plaque_all_na"] == 1:
            key = (row["tooth"], row["side"], row["pdbaseline_grouped"])
            mode_value = modes_dict.get(key)
            if isinstance(mode_value, tuple):
                return 2 if 2 in mode_value else 1
            # Missing keys, None and any mode other than 2 fall back to 1.
            return 2 if mode_value == 2 else 1

        if pd.isna(row["plaque"]):
            return 1
        return row["plaque"]

    def plaque_imputation(self, data: pd.DataFrame) -> pd.DataFrame:
        """Imputes values for Plaque without affecting other columns.

        Column names of the input frame are lower-cased in place. The columns
        'pdbaseline', 'tooth', 'side' and the configured group column are read
        in addition to 'plaque'.

        Args:
            data (pd.DataFrame): Input DataFrame with a 'plaque' column.

        Returns:
            pd.DataFrame: The DataFrame with the imputed 'plaque' values.

        Raises:
            KeyError: If plaque column is not found in DataFrame.
        """
        data.columns = [col.lower() for col in data.columns]

        if "plaque" not in data.columns:
            raise KeyError("'plaque' column not found in the DataFrame")
        data["plaque"] = pd.to_numeric(data["plaque"], errors="coerce")

        # Bucket baseline pocket depth: <=3 -> 0, 4..5 -> 1, >=6 -> 2;
        # anything else (including NaN) -> -1.
        conditions_baseline = [
            data["pdbaseline"] <= 3,
            (data["pdbaseline"] >= 4) & (data["pdbaseline"] <= 5),
            data["pdbaseline"] >= 6,
        ]
        choices_baseline = [0, 1, 2]
        data["pdbaseline_grouped"] = np.select(
            conditions_baseline, choices_baseline, default=-1
        )

        # Flag every row of a patient for whom no plaque value was recorded.
        patients_with_all_nas = data.groupby(self.group_col)["plaque"].apply(
            lambda x: all(pd.isna(x))
        )
        data["plaque_all_na"] = data[self.group_col].isin(
            patients_with_all_nas[patients_with_all_nas].index
        )

        # First mode of 'plaque' per (tooth, side, depth bucket); None when the
        # group holds no observed value.
        modes_dict = {}
        for key, group in data.groupby(["tooth", "side", "pdbaseline_grouped"]):
            modes = group["plaque"].mode()
            modes_dict[key] = modes.iloc[0] if not modes.empty else None

        data["plaque"] = data.apply(
            lambda row: self._plaque_values(row=row, modes_dict=modes_dict), axis=1
        )

        data = data.drop(["pdbaseline_grouped", "plaque_all_na"], axis=1)

        return data

    def _fur_side(self, nr: int) -> Union[np.ndarray, str]:
        """Returns the sides for the input tooth that should have furcations.

        Args:
            nr (int): Tooth number.

        Returns:
            Union[np.ndarray, str]: Sides with furcations, or the string
            'Tooth without Furkation' if no key of `sides_with_fur` contains
            the tooth.
        """
        for key, value in self.sides_with_fur.items():
            if nr in key:
                return np.array(value)
        return "Tooth without Furkation"

    def _fur_values(self, row: pd.Series) -> int:
        """Calculate values for the FurcationBaseline column.

        Args:
            row (pd.Series): A row from the DataFrame. Must provide 'tooth',
                'side', 'pdbaseline', 'recbaseline', 'furcationbaseline' and
                'furcationbaseline_all_na'.

        Returns:
            int: Imputed value for furcationbaseline.

        Raises:
            ValueError: If NaN is found in pd- or recbaseline.
        """
        tooth_fur = [14, 16, 17, 18, 24, 26, 27, 28, 36, 37, 38, 46, 47, 48]
        if pd.isna(row["pdbaseline"]) or pd.isna(row["recbaseline"]):
            raise ValueError(
                "NaN found in pdbaseline or recbaseline. Check RecBaseline imputation."
            )

        if row["furcationbaseline_all_na"] != 1:
            # Patient has some recorded values: keep them, fill gaps with 0.
            if pd.isna(row["furcationbaseline"]):
                return 0
            return row["furcationbaseline"]

        if row["tooth"] not in tooth_fur:
            return 0

        sides = self._fur_side(nr=row["tooth"])
        # `_fur_side` answers with a string when the tooth has no configured
        # sides; `side in <str>` would raise for numeric sides, so treat that
        # case like a side without furcation.
        if isinstance(sides, str) or row["side"] not in sides:
            return 0

        # Grade from the sum of baseline pocket depth and recession.
        total = row["pdbaseline"] + row["recbaseline"]
        if total < 4:
            return 0
        if total < 6:
            return 1
        return 2

    def fur_imputation(self, data: pd.DataFrame) -> pd.DataFrame:
        """Impute the values in the FurcationBaseline column.

        Args:
            data (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

        Raises:
            KeyError: If furcationbaseline is not found in DataFrame.
        """
        if "furcationbaseline" not in data.columns:
            raise KeyError("'furcationbaseline' column not found in the DataFrame")

        # Flag every row of a patient for whom no furcation value was recorded.
        patients_with_all_nas = data.groupby(self.group_col)["furcationbaseline"].apply(
            lambda x: all(pd.isna(x))
        )
        data["furcationbaseline_all_na"] = data[self.group_col].isin(
            patients_with_all_nas[patients_with_all_nas].index
        )

        data["furcationbaseline"] = data.apply(self._fur_values, axis=1)
        data = data.drop(["furcationbaseline_all_na"], axis=1)

        return data

__init__()

Initialize ProcessDataHelper with its lookup tables (tooth neighbors and furcation sides) without storing the DataFrame.

Source code in periomod/data/_helpers.py
202
203
204
205
206
def __init__(self):
    """Initialize the helper's lookup tables without storing a DataFrame.

    Runs the base-class initializer, then fills `teeth_neighbors` and
    `sides_with_fur` from the module-level helper functions.
    """
    super().__init__()
    self.teeth_neighbors = _get_teeth_neighbors()
    self.sides_with_fur = _get_side()

check_infection(depth, boprevaluation) staticmethod

Check if a given tooth side is infected.

Parameters:

Name Type Description Default
depth int

the depth of the pocket before the therapy

required
boprevaluation int

the value of BOP evaluation for the tooth side

required

Returns:

Type Description
int

1 if the tooth side is infected, otherwise 0.

Source code in periomod/data/_helpers.py
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
@staticmethod
def check_infection(depth: int, boprevaluation: int) -> int:
    """Check if a given tooth side is infected.

    Args:
        depth: the depth of the pocket before the therapy
        boprevaluation: the value of BOP evaluation for the tooth side

    Returns:
        1 if the tooth side is infected, otherwise 0.
    """
    if depth > 4:
        return 1
    elif depth == 4 and boprevaluation == 2:
        return 1
    return 0

fur_imputation(data)

Impute the values in the FurcationBaseline column.

Parameters:

Name Type Description Default
data DataFrame

Input DataFrame.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

Raises:

Type Description
KeyError

If furcationbaseline is not found in DataFrame.

Source code in periomod/data/_helpers.py
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
def fur_imputation(self, data: pd.DataFrame) -> pd.DataFrame:
    """Fill in the 'furcationbaseline' column row by row.

    Patients whose furcation values are all missing are flagged first; the
    per-row rule in `_fur_values` then decides each value. The temporary flag
    column is added to the input frame and removed from the returned frame.

    Args:
        data (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: The DataFrame with imputed values for 'furcationbaseline'.

    Raises:
        KeyError: If furcationbaseline is not found in DataFrame.
    """
    target = "furcationbaseline"
    flag = "furcationbaseline_all_na"

    if target not in data.columns:
        raise KeyError("'furcationbaseline' column not found in the DataFrame")

    # One boolean per patient: True when every furcation value is missing.
    group_all_missing = data.groupby(self.group_col)[target].apply(
        lambda values: pd.isna(values).all()
    )
    fully_missing_ids = group_all_missing[group_all_missing].index
    data[flag] = data[self.group_col].isin(fully_missing_ids)

    data[target] = data.apply(self._fur_values, axis=1)
    return data.drop(columns=[flag])

get_adjacent_infected_teeth_count(data, patient_col, tooth_col, infection_col)

Adds a new column indicating the number of adjacent infected teeth.

Parameters:

Name Type Description Default
data DataFrame

Dataset to process.

required
patient_col str

Name of column containing ID for patients.

required
tooth_col str

Name of column containing teeth represented in numbers.

required
infection_col str

Name of column indicating whether a tooth is infected (a value of 1 marks the tooth as infected).

required

Returns:

Type Description
DataFrame

pd.DataFrame: Modified dataset with new column 'infected_neighbors'.

Source code in periomod/data/_helpers.py
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
def get_adjacent_infected_teeth_count(
    self, data: pd.DataFrame, patient_col: str, tooth_col: str, infection_col: str
) -> pd.DataFrame:
    """Adds a new column indicating the number of adjacent infected teeth.

    The input frame is modified in place and also returned. Teeth for which
    `_tooth_neighbor` reports no neighbors get a count of 0.

    Args:
        data (pd.DataFrame): Dataset to process.
        patient_col (str): Name of column containing ID for patients.
        tooth_col (str): Name of column containing teeth represented in numbers.
        infection_col (str): Name of column flagging infected teeth (a value
            of 1 marks the tooth as infected).

    Returns:
        pd.DataFrame: Modified dataset with new column 'infected_neighbors'.
    """
    for patient_id, patient_data in data.groupby(patient_col):
        infected_teeth = set(
            patient_data[patient_data[infection_col] == 1][tooth_col]
        )

        # Bind the current patient's set as a default argument so the helper
        # never sees a later iteration's value.
        def count_infected(tooth, infected_teeth=infected_teeth) -> int:
            neighbors = self._tooth_neighbor(nr=tooth)
            # Unknown teeth come back as a zero-dimensional 'No tooth' value;
            # iterating it would raise TypeError, so count no neighbors.
            if np.ndim(neighbors) == 0:
                return 0
            return sum(1 for neighbor in neighbors if neighbor in infected_teeth)

        data.loc[data[patient_col] == patient_id, "infected_neighbors"] = (
            patient_data[tooth_col].apply(count_infected)
        )

    # With no patient groups (e.g. an empty frame) the loop never creates the
    # column; add it so the documented output schema always holds.
    if "infected_neighbors" not in data.columns:
        data["infected_neighbors"] = np.nan

    return data

plaque_imputation(data)

Imputes values for Plaque without affecting other columns.

Parameters:

Name Type Description Default
data DataFrame

Input DataFrame with a 'plaque' column.

required

Returns:

Type Description
DataFrame

pd.DataFrame: The DataFrame with the imputed 'plaque' values.

Raises:

Type Description
KeyError

If plaque column is not found in DataFrame.

Source code in periomod/data/_helpers.py
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
def plaque_imputation(self, data: pd.DataFrame) -> pd.DataFrame:
    """Fill in the 'plaque' column and leave the other columns untouched.

    Column names of the input frame are lower-cased in place. Two temporary
    columns ('pdbaseline_grouped', 'plaque_all_na') are added to the input
    frame while working and are removed from the returned frame.

    Args:
        data (pd.DataFrame): Input DataFrame with a 'plaque' column.

    Returns:
        pd.DataFrame: The DataFrame with the imputed 'plaque' values.

    Raises:
        KeyError: If plaque column is not found in DataFrame.
    """
    data.columns = [name.lower() for name in data.columns]

    if "plaque" not in data.columns:
        raise KeyError("'plaque' column not found in the DataFrame")
    data["plaque"] = pd.to_numeric(data["plaque"], errors="coerce")

    # Bucket baseline pocket depth: <=3 -> 0, 4..5 -> 1, >=6 -> 2; anything
    # else (including NaN) -> -1.
    depth = data["pdbaseline"]
    data["pdbaseline_grouped"] = np.select(
        [depth <= 3, (depth >= 4) & (depth <= 5), depth >= 6],
        [0, 1, 2],
        default=-1,
    )

    # One boolean per patient: True when every plaque value is missing.
    group_all_missing = data.groupby(self.group_col)["plaque"].apply(
        lambda values: pd.isna(values).all()
    )
    fully_missing_ids = group_all_missing[group_all_missing].index
    data["plaque_all_na"] = data[self.group_col].isin(fully_missing_ids)

    def first_mode(values):
        # First mode of the observed values, or None when nothing is observed.
        candidates = values.mode()
        return None if candidates.empty else candidates.iloc[0]

    modes_dict = {
        key: first_mode(group["plaque"])
        for key, group in data.groupby(["tooth", "side", "pdbaseline_grouped"])
    }

    data["plaque"] = data.apply(
        lambda row: self._plaque_values(row=row, modes_dict=modes_dict), axis=1
    )

    return data.drop(columns=["pdbaseline_grouped", "plaque_all_na"])