Tabular Classification
Scikit-learn
English
hierarchical
healthcare
ehr
copd
clinical-risk
tabular
scikit-learn
clustering
unsupervised
Instructions to use stormid/copd-model-e with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use stormid/copd-model-e with Scikit-learn:
```python
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("stormid/copd-model-e", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
```
- Notebooks
- Google Colab
- Kaggle
| """ | |
| Process SMR01 admission data | |
| -------- | |
| Clean and process admission data while adding tracking for COPD and respiratory | |
| admissions per year for each SafeHavenID | |
| """ | |
| import json | |
| import pandas as pd | |
| from datetime import date | |
| from dateutil.relativedelta import relativedelta | |
| from utils.common import add_hist_adm_presc, first_patient_appearance | |
| from utils.adm_common import (initialize_adm_data, correct_stays, | |
| track_copd_resp) | |
| from utils.adm_processing import (convert_ethgrp_desc, mode_ethnicity, | |
| search_diag) | |
| from utils.adm_reduction import fill_missing_years, calc_adm_per_year | |
def process_ethnicity(df):
    """
    Find relevant ethnic group for each patient, accounting for null data
    --------
    :param df: admission dataframe to be updated; the code reads the
        'SafeHavenID' and 'ETHGRP' columns
    :return: admission dataframe with ethnicity cleaned and updated (the
        result of applying mode_ethnicity to each SafeHavenID group)
    """
    print('Processing ethnicity')

    # Work on the renamed copy so the caller's frame keeps its 'ETHGRP' column
    df = df.rename(columns={'ETHGRP': 'eth_grp'})
    df['eth_grp'] = df['eth_grp'].str.strip()

    # Fill in missing ethnicities from the same patient's other rows, falling
    # back to 'Unknown' when the patient has no recorded value at all.
    # transform() is used instead of apply() because it always returns a
    # result indexed like df, so the column assignment aligns row-for-row
    # regardless of the pandas version's group_keys behaviour.
    df['eth_grp'] = df.groupby('SafeHavenID')['eth_grp'].transform(
        lambda grp: grp.ffill().bfill().fillna('Unknown'))

    # Convert to 1 of 7 ethnic groups
    df['eth_grp'] = [convert_ethgrp_desc(eth) for eth in df['eth_grp']]

    # Find most commonly occurring ethnicity per SafeHavenID
    df = df.groupby('SafeHavenID').apply(mode_ethnicity, 'eth_grp')

    return df
def add_eoy_column(df, dt_col, eoy_date):
    """
    Add EOY relative to user-specified end date
    --------
    For each row, 'eoy' is the month/day of eoy_date placed in the calendar
    year of dt_col. When dt_col falls strictly after that date, 'eoy' is moved
    forward by one year, so that eoy >= dt_col for every row.

    :param df: dataframe
    :param dt_col: date column in dataframe (datetime64 dtype, as the .dt
        accessor is used on it)
    :param eoy_date: EOY date from config; only its month and day are used
    :return: updated df (index reset to 0..n-1) with datetime 'eoy' column added
    """
    # Needed to stop error with creating a new column
    df = df.reset_index(drop=True)

    # Only the month and day of the user-specified end of year date matter
    end_date = pd.to_datetime(eoy_date)
    event_dates = df[dt_col]

    # Build the candidate EOY for every row directly as datetime64 so the
    # comparison below is datetime-vs-datetime (comparing against
    # datetime.date objects is not supported by current pandas).
    eoy = pd.to_datetime(pd.DataFrame({
        'year': event_dates.dt.year,
        'month': end_date.month,
        'day': end_date.day,
    }))

    # Rows dated after their same-year EOY belong to the following period
    after_eoy = event_dates > eoy
    df['eoy'] = eoy.mask(after_eoy, eoy + pd.DateOffset(years=1))

    return df
def extract_yearly_data(df):
    """
    Extract features on a yearly basis for each SafeHavenID
    --------
    :param df: admission dataframe to be updated; an 'adm' column set to 1 is
        written onto it in place before the yearly reduction
    :return: dataframe with feature values per year, restricted to the final
        feature columns in a fixed order
    """
    print('Reducing to 1 row SafeHavenID per year')

    # Flag every existing row as a real admission
    df['adm'] = 1

    # Insert rows for the years in which a patient had no admissions
    padded = df.groupby('SafeHavenID').apply(fill_missing_years)
    padded = padded.reset_index(drop=True)

    # Attach the historical count columns for admissions
    with_history = padded.groupby('SafeHavenID').apply(
        add_hist_adm_presc, 'adm', 'ADMDATE')
    with_history = with_history.reset_index(drop=True)

    # Collapse to a single row per year
    yearly = calc_adm_per_year(with_history)

    # Output columns, in their final order
    ordered_cols = [
        'eth_grp',
        'adm_per_year',
        'total_hosp_days',
        'mean_los',
        'copd_per_year',
        'resp_per_year',
        'anxiety_depression_per_year',
        'days_since_copd',
        'days_since_resp',
        'days_since_adm',
        'adm_to_date',
        'copd_to_date',
        'resp_to_date',
        'anxiety_depression_to_date',
        'copd_date',
        'resp_date',
        'adm_date',
    ]
    return yearly[ordered_cols]
def main():
    """
    Run the SMR01 admission processing pipeline end to end
    --------
    Reads '../../../config.json' (relative to the working directory), loads
    the admission extract, cleans stays and ethnicity, flags COPD /
    respiratory / anxiety-depression events, then writes two pickles to
    config['model_data_path']: the per-event data ('validation_adm_proc.pkl')
    and the per-year data ('adm_proc.pkl').
    """
    # Load in config items
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Load in data
    adm_file = config['extract_data_path'] + 'SMR01_Cohort3R.csv'
    adm = initialize_adm_data(adm_file)

    # Fill null STAY data and combine transfer admissions
    adm = correct_stays(adm)

    # Save first date in dataset
    data_path = config['model_data_path']
    first_patient_appearance(adm, 'ADMDATE', 'adm', data_path)

    # Process ethnicity data
    adm = process_ethnicity(adm)

    # Track COPD and respiratory events
    adm = track_copd_resp(adm)

    # Track anxiety event
    adm = search_diag(adm, 'anxiety_depression')

    # Select relevant columns
    reduced_cols = ['SafeHavenID', 'eth_grp', 'ADMDATE', 'STAY', 'copd_event',
                    'resp_event', 'anxiety_depression_event']
    adm_reduced = adm[reduced_cols]

    # Save per event dataset
    adm_reduced.to_pickle(data_path + 'validation_adm_proc.pkl')

    # Add column relative to user-specified date
    adm_reduced = add_eoy_column(adm_reduced, 'ADMDATE', config['date'])

    # Extract yearly data
    adm_yearly = extract_yearly_data(adm_reduced)

    # Save data
    adm_yearly.to_pickle(data_path + 'adm_proc.pkl')


# Only run the pipeline when executed as a script, so that importing this
# module (e.g. to reuse its functions) does not trigger file I/O.
if __name__ == '__main__':
    main()