Instructions to use LLouis0622/early_warning_model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use LLouis0622/early_warning_model with Scikit-learn:
```python
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("LLouis0622/early_warning_model", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
```
- Notebooks
- Google Colab
- Kaggle
| import pandas as pd | |
| import numpy as np | |
| import pickle | |
| import json | |
| import argparse | |
| from pathlib import Path | |
| import sys | |
| from feature_engineering import FeatureEngineer | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.metrics import (accuracy_score, precision_score, recall_score, | |
| f1_score, roc_auc_score, confusion_matrix) | |
| import xgboost as xgb | |
| import lightgbm as lgb | |
| from imblearn.over_sampling import SMOTE | |
def load_data(data_dir):
    """Read the three raw CSV tables found in ``data_dir``.

    Each file is parsed as cp949-encoded text and malformed rows are
    skipped (``on_bad_lines='skip'``). The shape of every table is printed
    once all three have been read.

    Args:
        data_dir: Directory containing ``big_data_set1_f.csv``,
            ``ds2_monthly_usage.csv`` and ``ds3_monthly_customers.csv``.

    Returns:
        Tuple ``(df_store, df_usage, df_customer)`` of DataFrames, in the
        same order as the file names listed above.
    """
    print("๋ฐ์ดํฐ ๋ก๋ ์ค...")

    # Identical parser options apply to all three files.
    read_options = {'encoding': 'cp949', 'on_bad_lines': 'skip'}
    file_names = (
        'big_data_set1_f.csv',
        'ds2_monthly_usage.csv',
        'ds3_monthly_customers.csv',
    )
    df_store, df_usage, df_customer = (
        pd.read_csv(f'{data_dir}/{file_name}', **read_options)
        for file_name in file_names
    )

    print(f"๋งค์ฅ ์ ๋ณด: {df_store.shape}")
    print(f"์ด์ฉ ๋ฐ์ดํฐ: {df_usage.shape}")
    print(f"๊ณ ๊ฐ ๋ฐ์ดํฐ: {df_customer.shape}")
    return df_store, df_usage, df_customer
def create_features(df_store, df_usage, df_customer, max_stores=None):
    """Build the feature matrix and binary target for every usable store.

    For each distinct ``ENCODED_MCT`` in ``df_store`` that has at least three
    rows in ``df_usage``, the store's rows are handed to
    ``FeatureEngineer.create_features`` and the result is collected. The
    target is 1 when the store's ``MCT_ME_D`` value is present and 0 when it
    is missing (presumably ``MCT_ME_D`` is a closure date -- confirm against
    the data dictionary).

    Args:
        df_store: Store table; must contain ``ENCODED_MCT``,
            ``HPSN_MCT_BZN_CD_NM``, ``MCT_SIGUNGU_NM`` and ``MCT_ME_D``.
        df_usage: Usage table keyed by ``ENCODED_MCT``.
        df_customer: Customer table keyed by ``ENCODED_MCT``.
        max_stores: If truthy, only the first ``max_stores`` store ids are
            processed. ``None`` (and 0) mean "all stores".

    Returns:
        Tuple ``(X, y)``: the concatenated feature DataFrame and a Series of
        0/1 targets, one entry per processed store.

    Raises:
        ValueError: If no store has the required three usage rows, so there
            is nothing to concatenate.
    """
    print("\nํน์ง ์์ฑ ์ค...")
    engineer = FeatureEngineer(include_weather=False)
    all_features = []
    all_targets = []

    # unique() keeps NaN, and NaN never compares equal to anything, so a
    # missing id would select zero store rows and make .iloc[0] raise
    # IndexError. Drop missing ids before iterating.
    store_ids = df_store['ENCODED_MCT'].dropna().unique()
    if max_stores:
        store_ids = store_ids[:max_stores]

    for idx, store_id in enumerate(store_ids):
        usage_data = df_usage[df_usage['ENCODED_MCT'] == store_id]

        # At least three usage rows are required; the remaining lookups are
        # only performed for stores that pass this check.
        if len(usage_data) >= 3:
            store_info = df_store[df_store['ENCODED_MCT'] == store_id].iloc[0]
            customer_data = df_customer[df_customer['ENCODED_MCT'] == store_id]
            industry = store_info['HPSN_MCT_BZN_CD_NM']
            store_data = {
                # A missing industry name falls back to a fixed placeholder.
                'industry': industry if pd.notna(industry) else '๊ธฐํ',
                'location': store_info['MCT_SIGUNGU_NM'],
            }
            features = engineer.create_features(store_data, usage_data, customer_data)
            target = 1 if pd.notna(store_info['MCT_ME_D']) else 0
            all_features.append(features)
            all_targets.append(target)

        if (idx + 1) % 500 == 0:
            print(f" ์ฒ๋ฆฌ ์ค... {idx + 1}/{len(store_ids)}")

    # pd.concat([]) raises an opaque "No objects to concatenate"; fail with a
    # message that names the actual cause (same exception type).
    if not all_features:
        raise ValueError(
            "No store has at least 3 usage rows; cannot build a feature matrix"
        )

    X = pd.concat(all_features, ignore_index=True)
    y = pd.Series(all_targets)
    print(f"์ด ์ํ: {len(X)}, ํน์ง ์: {X.shape[1]}")
    print(f"ํ์ ๋น์จ: {y.mean():.2%} ({y.sum()}๊ฐ)")
    return X, y
def preprocess_data(X, y):
    """Encode, impute and split the feature matrix.

    Steps, in order:
      1. Label-encode the ``context_industry`` column if it exists (values
         are cast to ``str`` first).
      2. Fill missing values with each numeric column's median.
      3. Make a stratified 75/25 train/test split with ``random_state=42``.

    The caller's ``X`` is left untouched; all work happens on a copy.

    Args:
        X: Feature DataFrame.
        y: Target labels aligned with ``X``; used for stratification.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, label_encoders)`` where
        ``label_encoders`` maps a column name to its fitted ``LabelEncoder``
        (empty when ``context_industry`` is absent).
    """
    print("\n๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ์ค...")

    # Work on a copy so the column assignment below does not silently
    # rewrite the DataFrame owned by the caller.
    X = X.copy()

    # Categorical variable encoding
    label_encoders = {}
    if 'context_industry' in X.columns:
        le = LabelEncoder()
        X['context_industry'] = le.fit_transform(X['context_industry'].astype(str))
        label_encoders['context_industry'] = le

    # Missing-value handling. numeric_only=True keeps median() from raising
    # on any remaining non-numeric column (pandas >= 2.0 no longer drops
    # such columns silently).
    # NOTE(review): the medians and the encoder are fitted on the full data
    # before the split, so test rows influence them -- confirm this is
    # acceptable for the reported metrics.
    X = X.fillna(X.median(numeric_only=True))

    # Data split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )
    print(f"Train: {X_train.shape}, Test: {X_test.shape}")
    print(f"Train ํ์ : {y_train.mean():.2%}, Test ํ์ : {y_test.mean():.2%}")
    return X_train, X_test, y_train, y_test, label_encoders
def apply_smote(X_train, y_train):
    """Oversample the minority class of the training set with SMOTE.

    ``k_neighbors`` is capped at ``minority_count - 1`` (and at 5). When the
    smaller class has fewer than two samples that cap would be zero or
    negative, which SMOTE rejects, so oversampling is skipped and the inputs
    are returned unchanged.

    Args:
        X_train: Training features.
        y_train: Training labels; the size computation assumes they are
            encoded as 0/1.

    Returns:
        Tuple ``(X_train_balanced, y_train_balanced)``. These are the
        original objects when SMOTE was skipped.
    """
    print("\nํด๋์ค ๋ถ๊ท ํ ์ฒ๋ฆฌ(SMOTE)...")

    # Size of the smaller class, given 0/1 labels.
    positives = y_train.sum()
    min_samples = min(positives, len(y_train) - positives)

    # SMOTE interpolates between a sample and its neighbours within the same
    # class, so it needs at least two minority samples to work at all.
    if min_samples < 2:
        print(
            f"SMOTE skipped: smallest class has {min_samples} sample(s), "
            "at least 2 are required"
        )
        return X_train, y_train

    k_neighbors = min(5, min_samples - 1)
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    print(f"SMOTE ํ: ์์ {(y_train_balanced == 0).sum()}๊ฐ, ํ์ {(y_train_balanced == 1).sum()}๊ฐ")
    return X_train_balanced, y_train_balanced
def train_models(X_train, y_train):
    """Fit one XGBoost and one LightGBM classifier on the same data.

    Both estimators use depth 6, learning rate 0.1, 200 estimators and
    seed 42. XGBoost additionally sets ``eval_metric='logloss'`` and
    LightGBM sets ``verbose=-1``.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        Tuple ``(xgb_model, lgb_model)`` of fitted classifiers.
    """
    print("\n๋ชจ๋ธ ํ์ต ์ค...")

    # Hyperparameters common to both boosters.
    common_params = {
        'max_depth': 6,
        'learning_rate': 0.1,
        'n_estimators': 200,
        'random_state': 42,
    }

    # XGBoost
    print(" - XGBoost ํ์ต...")
    xgb_model = xgb.XGBClassifier(eval_metric='logloss', **common_params)
    xgb_model.fit(X_train, y_train)

    # LightGBM
    print(" - LightGBM ํ์ต...")
    lgb_model = lgb.LGBMClassifier(verbose=-1, **common_params)
    lgb_model.fit(X_train, y_train)

    print("๋ชจ๋ธ ํ์ต ์๋ฃ")
    return xgb_model, lgb_model
def evaluate_models(xgb_model, lgb_model, X_test, y_test):
    """Score the equally weighted ensemble of both models on the test set.

    The positive-class probabilities of the two models are averaged
    (weights 0.5 / 0.5) and thresholded at 0.5 to obtain hard predictions.
    Accuracy, precision, recall, F1 and ROC-AUC are printed together with
    the confusion-matrix cells.

    Args:
        xgb_model: Fitted classifier exposing ``predict_proba``.
        lgb_model: Fitted classifier exposing ``predict_proba``.
        X_test: Test features.
        y_test: Test labels (0/1).

    Returns:
        Dict with float entries ``accuracy``, ``precision``, ``recall``,
        ``f1_score`` and ``auc_roc``.
    """
    print("\n๋ชจ๋ธ ํ๊ฐ ์ค...")

    # Prediction: probability of the positive class (column 1).
    xgb_pred = xgb_model.predict_proba(X_test)[:, 1]
    lgb_pred = lgb_model.predict_proba(X_test)[:, 1]

    # Ensemble
    ensemble_pred = 0.5 * xgb_pred + 0.5 * lgb_pred
    ensemble_pred_binary = (ensemble_pred > 0.5).astype(int)

    # Evaluation metrics
    accuracy = accuracy_score(y_test, ensemble_pred_binary)
    precision = precision_score(y_test, ensemble_pred_binary, zero_division=0)
    recall = recall_score(y_test, ensemble_pred_binary, zero_division=0)
    f1 = f1_score(y_test, ensemble_pred_binary, zero_division=0)
    auc = roc_auc_score(y_test, ensemble_pred)

    print("\n" + "=" * 70)
    print("๋ชจ๋ธ ์ฑ๋ฅ (Test Set)")
    print("=" * 70)
    print(f"Accuracy: {accuracy:.4f} ({accuracy * 100:.1f}%)")
    print(f"Precision: {precision:.4f} ({precision * 100:.1f}%)")
    print(f"Recall: {recall:.4f} ({recall * 100:.1f}%)")
    print(f"F1-Score: {f1:.4f}")
    print(f"AUC-ROC: {auc:.4f}")
    print("=" * 70)

    # Confusion matrix. Pinning labels=[0, 1] guarantees a 2x2 result even
    # when only one class appears in both the labels and the predictions;
    # without it the matrix would be 1x1 and cm[0, 1] would raise IndexError.
    cm = confusion_matrix(y_test, ensemble_pred_binary, labels=[0, 1])
    print(f"\nํผ๋ ํ๋ ฌ:")
    print(f" TN: {cm[0, 0]}, FP: {cm[0, 1]}")
    print(f" FN: {cm[1, 0]}, TP: {cm[1, 1]}")

    return {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'auc_roc': float(auc)
    }
def save_models(xgb_model, lgb_model, X, label_encoders, performance, output_dir):
    """Persist the trained artifacts under ``output_dir``.

    The directory (and any missing parents) is created first. Files written:
      * ``xgboost_model.pkl``, ``lightgbm_model.pkl``, ``label_encoders.pkl``
        -- pickled objects;
      * ``feature_names.json`` -- ``list(X.columns)``;
      * ``config.json`` -- model version ``'2.0'``, ensemble weights
        ``[0.5, 0.5]``, threshold ``0.5``, feature count and ``performance``.

    Args:
        xgb_model: Object pickled to ``xgboost_model.pkl``.
        lgb_model: Object pickled to ``lightgbm_model.pkl``.
        X: Object whose ``columns`` give the feature names.
        label_encoders: Object pickled to ``label_encoders.pkl``.
        performance: JSON-serialisable metrics stored in the config.
        output_dir: Destination directory.

    Returns:
        None.
    """
    print(f"\n๋ชจ๋ธ ์ ์ฅ ์ค... ({output_dir})")
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Binary artifacts, written in this order.
    pickled_artifacts = (
        ('xgboost_model.pkl', xgb_model),
        ('lightgbm_model.pkl', lgb_model),
        ('label_encoders.pkl', label_encoders),
    )
    for file_name, artifact in pickled_artifacts:
        with open(output_path / file_name, 'wb') as handle:
            pickle.dump(artifact, handle)

    # Feature names
    feature_names = list(X.columns)
    (output_path / 'feature_names.json').write_text(
        json.dumps(feature_names, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    # Configuration
    config = {
        'model_version': '2.0',
        'ensemble_weights': [0.5, 0.5],
        'threshold': 0.5,
        'n_features': len(feature_names),
        'performance': performance,
    }
    (output_path / 'config.json').write_text(
        json.dumps(config, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    print("๋ชจ๋ธ ์ ์ฅ ์๋ฃ")
    for reported_name in ('xgboost_model.pkl', 'lightgbm_model.pkl', 'config.json'):
        print(f" - {output_path / reported_name}")
def main():
    """Run the full training pipeline from the command line.

    Command-line options:
        --data        Directory with the raw CSV files (default ``data/raw``).
        --output      Directory the artifacts are written to (default ``models``).
        --max-stores  Optional integer cap on the number of stores processed.

    The stages run in order: load data, build features, preprocess and
    split, apply SMOTE to the training part, train both models, evaluate
    the ensemble on the test part, and save everything.
    """
    banner = "=" * 70

    parser = argparse.ArgumentParser(description='์์์ ์กฐ๊ธฐ๊ฒฝ๋ณด ๋ชจ๋ธ ํ์ต')
    parser.add_argument(
        '--data', type=str, default='data/raw',
        help='๋ฐ์ดํฐ ๋๋ ํ ๋ฆฌ ๊ฒฝ๋ก',
    )
    parser.add_argument(
        '--output', type=str, default='models',
        help='๋ชจ๋ธ ์ ์ฅ ๊ฒฝ๋ก',
    )
    parser.add_argument(
        '--max-stores', type=int, default=None,
        help='์ต๋ ๋งค์ฅ ์ (ํ ์คํธ์ฉ)',
    )
    args = parser.parse_args()

    print(banner)
    print("์์์ ์กฐ๊ธฐ๊ฒฝ๋ณด ๋ชจ๋ธ v2.0 ํ์ต")
    print(banner)

    # 1. Load data
    stores, usage, customers = load_data(args.data)

    # 2. Build features
    X, y = create_features(stores, usage, customers, args.max_stores)

    # 3. Preprocess and split
    X_train, X_test, y_train, y_test, label_encoders = preprocess_data(X, y)

    # 4. SMOTE (training part only)
    X_balanced, y_balanced = apply_smote(X_train, y_train)

    # 5. Train
    xgb_model, lgb_model = train_models(X_balanced, y_balanced)

    # 6. Evaluate on the untouched test part
    performance = evaluate_models(xgb_model, lgb_model, X_test, y_test)

    # 7. Save; X is passed for its column names
    save_models(xgb_model, lgb_model, X, label_encoders, performance, args.output)

    print("\n" + banner)
    print("ํ์ต ์๋ฃ!")
    print(banner)


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()