Tabular Classification
Scikit-learn
English
hierarchical
healthcare
ehr
copd
clinical-risk
tabular
scikit-learn
clustering
unsupervised
Instructions to use stormid/copd-model-e with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use stormid/copd-model-e with Scikit-learn:
```python
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("stormid/copd-model-e", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
```
- Notebooks
- Google Colab
- Kaggle
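| """ | |
| Script to remove all receiver and scale-up cohort IDs from the merged dataset, | |
| save those cohorts separately, and split the remaining data into train, test and validation sets. | |
| """ | |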
| import json | |
| import pandas as pd | |
| from sklearn.model_selection import train_test_split | |
def get_ids(path):
    """
    Load the SafeHavenID column of a CSV file as a list
    --------
    :param path: path to a cp1252-encoded CSV file with a 'SafeHavenID' column
    :return: list of SafeHavenIDs, in file order
    """
    print('Loading IDs from ' + path)

    # Only the ID column is needed; the rest of the file is discarded
    id_column = pd.read_csv(path, encoding="cp1252")['SafeHavenID']
    return id_column.tolist()
def save_rec_sup(df, data_path, rec_ids, sup_ids):
    """
    Split off the receiver and scale-up cohorts, pickle them, and return the rest
    --------
    :param df: pandas dataframe with a 'SafeHavenID' column
    :param data_path: path prefix for generated data (file names are appended
        directly, so it should end with a path separator)
    :param rec_ids: iterable of SafeHavenIDs in receiver cohort to remove
    :param sup_ids: iterable of SafeHavenIDs in scale-up cohort to remove
    :return: dataframe holding only the rows whose SafeHavenID is in neither
        cohort
    """
    print('Saving REC and SUP data')

    # Materialise the IDs so any iterable (list, tuple, set, Series, ...) is
    # accepted; concatenating with `+` only works when both are lists
    rec_ids = list(rec_ids)
    sup_ids = list(sup_ids)

    # Row masks for each cohort; an ID present in both lists is written to
    # both cohort files
    in_rec = df['SafeHavenID'].isin(rec_ids)
    in_sup = df['SafeHavenID'].isin(sup_ids)

    # Save cohort data
    df[in_rec].to_pickle(data_path + 'merged_rec.pkl')
    df[in_sup].to_pickle(data_path + 'merged_sup.pkl')

    # Everything that belongs to neither cohort is kept for modelling
    return df[~(in_rec | in_sup)]
def save_df_ids(df, data_path, ids, typ):
    """
    Pickle one split: its list of IDs and the rows of df that belong to it
    --------
    :param df: dataframe with a 'SafeHavenID' column
    :param data_path: path prefix for generated data (file names are appended
        directly)
    :param ids: list of SafeHavenIDs making up the split
    :param typ: name of the split, 'train', 'test' or 'val'; used in the
        output file names
    :return: None
    """
    print('Saving ' + typ + ' data')

    # Output locations: <typ>_ids.pkl for the IDs, merged_<typ>.pkl for the rows
    ids_file = data_path + typ + '_ids.pkl'
    rows_file = data_path + 'merged_' + typ + '.pkl'

    # IDs are stored as a single-column dataframe
    pd.DataFrame(ids, columns=['SafeHavenID']).to_pickle(ids_file)

    # All rows whose ID is part of this split
    in_split = df['SafeHavenID'].isin(ids)
    df[in_split].to_pickle(rows_file)
def df_tts(df, data_path):
    """
    Split data into training, testing and validation sets and save dataframes
    --------
    :param df: pandas dataframe to split, must contain a 'SafeHavenID' column
    :param data_path: path to generated data
    :return: None
    """
    # Split on distinct IDs. If an ID had several rows and appeared more than
    # once in the list, it could be assigned to more than one split, and every
    # row for that ID would then be saved into each of those splits.
    # unique() keeps first-appearance order, so the split is unchanged when
    # the IDs are already unique.
    ids = df['SafeHavenID'].unique().tolist()

    # Hold out 20% of the IDs for testing
    train_ids, test_ids = train_test_split(
        ids, test_size=0.2, random_state=42)

    # 25% of the remaining 80% becomes validation: 60/20/20 overall
    train_ids, val_ids = train_test_split(
        train_ids, test_size=0.25, random_state=42)

    # Save IDs and datasets
    save_df_ids(df, data_path, train_ids, 'train')
    save_df_ids(df, data_path, test_ids, 'test')
    save_df_ids(df, data_path, val_ids, 'val')
def main():
    """
    Run the full pipeline: load config, strip cohort IDs, split and save
    --------
    Reads paths from '../../../config.json', loads the receiver and scale-up
    ID lists, removes those IDs from the merged dataset (saving each cohort
    separately) and writes train, test and validation splits of the remainder.
    :return: None
    """
    # Load in config items
    with open('../../../config.json') as fh:
        cfg = json.load(fh)

    # Resolve every path up front
    model_dir = cfg['model_data_path']
    receiver_csv = cfg['rec_data_path'] + 'Cohort3Rand.csv'
    scale_up_csv = cfg['sup_data_path'] + 'Scale_Up_lookup.csv'

    # IDs that must not take part in modelling
    receiver_ids = get_ids(receiver_csv)
    scale_up_ids = get_ids(scale_up_csv)

    # Save the two cohorts and keep only the remaining rows
    merged = pd.read_pickle(model_dir + 'merged.pkl')
    remaining = save_rec_sup(merged, model_dir, receiver_ids, scale_up_ids)

    # Split and save the data
    df_tts(remaining, model_dir)
# Only run the pipeline when executed as a script, so that importing this
# module does not trigger the file reads and writes performed by main().
if __name__ == "__main__":
    main()