Tabular Classification
Scikit-learn
English
random-forest
machine-learning
classification
automl
streamlit
python
scikit-learn
student-project
csv-model
ensemble-learning
decision-trees
Instructions to use Asma-Abid/Random-Forest with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Scikit-learn
How to use Asma-Abid/Random-Forest with Scikit-learn:
```python
from huggingface_hub import hf_hub_download
import joblib

model = joblib.load(
    hf_hub_download("Asma-Abid/Random-Forest", "sklearn_model.joblib")
)
# only load pickle files from sources you trust
# read more about it here https://skops.readthedocs.io/en/stable/persistence.html
```
- Notebooks
- Google Colab
- Kaggle
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import io | |
| import os | |
| import joblib | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from datetime import datetime | |
| from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler | |
| from sklearn.metrics import ( | |
| accuracy_score, confusion_matrix, silhouette_score, | |
| classification_report, f1_score, precision_score, recall_score | |
| ) | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.svm import SVC | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.tree import DecisionTreeClassifier | |
| from sklearn.cluster import KMeans | |
| from sklearn.feature_selection import mutual_info_classif | |
| from sklearn.utils import resample | |
# ==========================================================
# PAGE CONFIG
# ==========================================================
# Browser-tab title and icon, plus the full-width ("wide") page layout.
st.set_page_config(page_title="AI AutoML Platform", page_icon="🤖", layout="wide")
# ==========================================================
# SESSION STATE
# ==========================================================
# Seed every key the app reads later; keys that already exist are left
# untouched so values survive Streamlit reruns.
#   history          - one entry per applied model (shown in the History table)
#   last_model_name  - name of the most recently applied model
#   last_score       - score of the most recently applied model
#   model_results    - detailed results per model run, used by the reports
#   selected_target  - chosen target column, so the report can reference it
#   cleaned_df       - cleaned DataFrame reference for report generation
for _state_key, _state_default in (
    ("history", []),
    ("last_model_name", None),
    ("last_score", None),
    ("model_results", []),
    ("selected_target", None),
    ("cleaned_df", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
| # ========================================================== | |
| # THEME CSS | |
| # ========================================================== | |
| # Inject a single global <style> block: dark gradient app background, sky-blue | |
| # accents for titles/section banners, and matching colours for buttons, the | |
| # file uploader, download buttons, select boxes and plain text. | |
| # The custom classes defined here (.big-title, .sub-title, .section, | |
| # .model-result-box) are referenced by the raw-HTML st.markdown calls elsewhere in this script. | |
| # NOTE(review): the .st* selectors target Streamlit-generated class names; confirm they still | |
| # match the Streamlit version in use. | |
| st.markdown(""" | |
| <style> | |
| .stApp { | |
| background: linear-gradient(135deg,#0f172a,#111827,#020617); | |
| color: white; | |
| } | |
| .big-title { | |
| font-size: 42px; | |
| font-weight: 800; | |
| color: #38bdf8; | |
| text-align:center; | |
| padding:15px; | |
| } | |
| .sub-title { | |
| text-align:center; | |
| color:#cbd5e1; | |
| font-size:18px; | |
| margin-bottom:25px; | |
| } | |
| .section { | |
| background:#0f172a; | |
| padding:12px; | |
| border-radius:12px; | |
| color:#38bdf8; | |
| font-weight:700; | |
| font-size:24px; | |
| margin-top:20px; | |
| } | |
| .stButton>button { | |
| background:#38bdf8; | |
| color:black; | |
| border:none; | |
| border-radius:10px; | |
| font-weight:700; | |
| } | |
| .stButton>button:hover { | |
| background:#0ea5e9; | |
| color:white; | |
| } | |
| div[data-baseweb="select"] > div { | |
| background:#1e293b !important; | |
| color:white !important; | |
| } | |
| .model-result-box { | |
| background:#1e293b; | |
| padding:20px; | |
| border-radius:12px; | |
| border:2px solid #38bdf8; | |
| margin:15px 0; | |
| } | |
| /* File Uploader Button */ | |
| .stFileUploader>div>div>button { | |
| background:#38bdf8 !important; | |
| color:black !important; | |
| border:none !important; | |
| border-radius:10px !important; | |
| font-weight:700 !important; | |
| } | |
| .stFileUploader>div>div>button:hover { | |
| background:#0ea5e9 !important; | |
| color:white !important; | |
| } | |
| /* File Uploader Button Alternative Selectors */ | |
| .stFileUploader button { | |
| background:#38bdf8 !important; | |
| color:black !important; | |
| border:none !important; | |
| border-radius:10px !important; | |
| font-weight:700 !important; | |
| } | |
| .stFileUploader button:hover { | |
| background:#0ea5e9 !important; | |
| color:white !important; | |
| } | |
| /* Download Buttons */ | |
| .stDownloadButton>button { | |
| background:#38bdf8 !important; | |
| color:black !important; | |
| border:none !important; | |
| border-radius:10px !important; | |
| font-weight:700 !important; | |
| } | |
| .stDownloadButton>button:hover { | |
| background:#0ea5e9 !important; | |
| color:white !important; | |
| } | |
| /* File Uploader Label */ | |
| .stFileUploader label { | |
| color:#38bdf8 !important; | |
| font-size:16px !important; | |
| font-weight:700 !important; | |
| } | |
| /* Selectbox Labels */ | |
| .stSelectbox label { | |
| color:#38bdf8 !important; | |
| font-size:16px !important; | |
| font-weight:700 !important; | |
| } | |
| /* Text and Write Styling */ | |
| p { | |
| color:#cbd5e1; | |
| } | |
| .stWrite { | |
| color:#cbd5e1; | |
| } | |
| /* Center pyplot figures and add lateral padding */ | |
| .stPlotlyChart, .stPyplot { | |
| display: flex; | |
| justify-content: center; | |
| } | |
| .stPyplot { | |
| padding: 0 50px; | |
| } | |
| .stPlotlyChart { | |
| padding: 0 50px; | |
| } | |
| /* Centered containers */ | |
| .stContainer { | |
| max-width: 95%; | |
| margin-left: auto; | |
| margin-right: auto; | |
| } | |
| /* Classification Report Text */ | |
| .stText { | |
| color: white !important; | |
| } | |
| .stText pre { | |
| color: white !important; | |
| } | |
| .stText * { | |
| color: white !important; | |
| } | |
| </style> | |
| """, unsafe_allow_html=True) | |
| # ========================================================== | |
| # HEADER | |
| # ========================================================== | |
| # Page title and tagline, emitted as raw HTML so the .big-title / .sub-title | |
| # classes from the theme CSS are applied. | |
| st.markdown('<div class="big-title">🤖 AI AutoML Platform</div>', unsafe_allow_html=True) | |
| st.markdown('<div class="sub-title">upload csv select model download trained model</div>', unsafe_allow_html=True) | |
| # ========================================================== | |
| # HELPERS | |
| # ========================================================== | |
def smart_clean(df):
    """Return a de-duplicated copy of ``df`` with missing values imputed.

    Text columns (object or pandas string dtype) are filled with their most
    frequent value; every other column is filled with its median, which is
    more robust to outliers than the mean.

    Columns that have no observed value at all are left untouched, because
    there is nothing to impute from (previously this raised on text columns).

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame without duplicate rows and with gaps filled.
    """
    cleaned = df.drop_duplicates().copy()
    for col in cleaned.columns:
        series = cleaned[col]
        if not series.isna().any():
            continue  # nothing to fill in this column

        is_text = series.dtype == "object" or pd.api.types.is_string_dtype(series)
        if is_text:
            modes = series.mode()
            if modes.empty:
                continue  # column is entirely missing
            fill_value = modes.iloc[0]
        else:
            fill_value = series.median()
            if pd.isna(fill_value):
                continue  # column is entirely missing

        cleaned[col] = series.fillna(fill_value)
    return cleaned
# First number in the text. A sign is only honoured when it is not glued to a
# preceding word/number (so "10-20" yields 10 and "item-5" yields 5).
_UNIT_NUMBER_RE = re.compile(r"(?:(?<![\w.])[-+])?[\d.]+")
# A length unit must stand on its own: "5km" and "5 km" match, "summer" does not.
_UNIT_TOKEN_RE = re.compile(r"(?<![a-z])(km|cm|mm|m)(?![a-z])")
# unit -> (multiplier, divisor) to express the value in metres.
_UNIT_TO_METRES = {"km": (1000, 1), "m": (1, 1), "cm": (1, 100), "mm": (1, 1000)}


def convert_units(value):
    """Convert a length such as ``"5 km"`` or ``"250cm"`` to metres.

    The first number found in the text is used. If a stand-alone unit token
    (km, cm, mm, m) is present the number is scaled to metres; otherwise the
    bare number is returned.

    Args:
        value: Any cell value; it is stringified before parsing.

    Returns:
        A float when a number could be parsed, otherwise ``value`` unchanged
        (e.g. plain words, missing values, or malformed numbers like ``"."``).
    """
    text = str(value).lower().strip()
    number_match = _UNIT_NUMBER_RE.search(text)
    if number_match is None:
        return value
    try:
        number = float(number_match.group())
    except ValueError:
        # Things like "." or "1.2.3" look numeric to the regex but are not.
        return value

    unit_match = _UNIT_TOKEN_RE.search(text)
    if unit_match is None:
        return number
    multiplier, divisor = _UNIT_TO_METRES[unit_match.group(1)]
    return number * multiplier / divisor
def detect_unit_columns(df):
    """Convert text columns that look like lengths with units into numbers.

    A text column is treated as a unit column when its first non-missing
    value contains one of the hints ``km``, ``cm``, ``mm`` or `` m``; every
    cell of such a column is then passed through ``convert_units``.

    Using the first *non-missing* value (instead of row 0) means an empty
    frame no longer raises and a leading gap no longer hides the units.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A copy of ``df`` with unit columns converted.
    """
    unit_hints = ("km", "cm", "mm", " m")
    converted = df.copy()
    for col in converted.columns:
        series = converted[col]
        is_text = series.dtype == "object" or pd.api.types.is_string_dtype(series)
        if not is_text:
            continue
        observed = series.dropna()
        if observed.empty:
            continue  # no rows, or nothing but missing values
        sample = str(observed.iloc[0]).lower()
        if any(hint in sample for hint in unit_hints):
            converted[col] = series.apply(convert_units)
    return converted
def detect_best_target(df):
    """Heuristically rank columns by how likely they are a classification target.

    Scoring per column:
      * +6 when it has between 2 and 15 distinct values (looks like classes)
      * +3 when it holds text (object or pandas string dtype)
      * -10 when more than 90% of rows are distinct (looks like an ID)
      * -5 when it has more than 50 distinct values

    Args:
        df: The dataset to inspect.

    Returns:
        ``(best, top5)`` where ``best`` is the highest-scoring column name and
        ``top5`` is a list of up to five ``(column, score)`` pairs, best first.
        Ties keep the original column order.

    Raises:
        ValueError: If ``df`` has no columns.
    """
    if df.shape[1] == 0:
        raise ValueError("Cannot detect a target column: the dataset has no columns.")

    n_rows = len(df)
    scores = {}
    for col in df.columns:
        series = df[col]
        unique = series.nunique()
        # A header-only CSV has zero rows; avoid dividing by zero.
        ratio = unique / n_rows if n_rows else 0.0

        score = 0
        if 2 <= unique <= 15:
            score += 6
        if series.dtype == "object" or pd.api.types.is_string_dtype(series):
            score += 3
        if ratio > 0.9:
            score -= 10
        if unique > 50:
            score -= 5
        scores[col] = score

    # sorted() is stable, so the first column wins among equal scores.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[0][0], ranked[:5]
def prepare_for_supervised(df, target):
    """Label-encode every object column and split features from the target.

    Args:
        df: Cleaned dataset. It is not modified.
        target: Name of the target column.

    Returns:
        ``(X, y, data)``: the feature frame, the target series, and the fully
        encoded frame they were taken from.
    """
    encoded = df.copy()
    text_columns = [name for name in encoded.columns if encoded[name].dtype == "object"]
    for name in text_columns:
        # Each column gets its own encoder, fitted on the stringified values.
        encoded[name] = LabelEncoder().fit_transform(encoded[name].astype(str))
    features = encoded.drop(columns=[target])
    labels = encoded[target]
    return features, labels, encoded
| # --- ACCURACY HELPER FUNCTIONS --- | |
def clip_outliers_iqr(df):
    """Clip outliers in numeric columns with the 1.5 * IQR rule.

    Values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are pulled back to the
    nearest bound; no rows are removed.

    Columns whose IQR is zero (or undefined) are skipped: both bounds would
    equal the quartile value and clipping would flatten the whole column to a
    constant, which destroys e.g. imbalanced 0/1 indicator columns.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        ``(clipped, info)`` where ``clipped`` is a copy of ``df`` and ``info``
        maps each clipped column name to the number of values that were
        outside the bounds.
    """
    clipped = df.copy()
    info = {}
    for col in clipped.select_dtypes(include=[np.number]).columns:
        q1 = clipped[col].quantile(0.25)
        q3 = clipped[col].quantile(0.75)
        iqr = q3 - q1
        if not iqr > 0:
            continue  # zero or NaN spread: clipping would erase the column

        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        outside = (clipped[col] < lower) | (clipped[col] > upper)
        n_out = int(outside.sum())
        if n_out > 0:
            clipped[col] = clipped[col].clip(lower=lower, upper=upper)
            info[col] = n_out
    return clipped, info
def remove_low_variance(X, threshold=0.01):
    """Drop feature columns whose variance is below ``threshold``.

    Args:
        X: Numeric feature frame.
        threshold: Variance cut-off; columns strictly below it are removed.

    Returns:
        ``(X, dropped)``: the (possibly reduced) frame and the list of removed
        column names. ``X`` is returned as-is when nothing is dropped.
    """
    per_column_variance = X.var()
    dropped = [name for name, value in per_column_variance.items() if value < threshold]
    if not dropped:
        return X, dropped
    return X.drop(columns=dropped), dropped
def remove_high_correlation(X, threshold=0.95):
    """Drop the later column of every highly correlated feature pair.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so of two correlated columns the one appearing first is kept.

    Args:
        X: Numeric feature frame.
        threshold: Absolute correlation above which a column is removed.

    Returns:
        ``(X, dropped)``: the (possibly reduced) frame and the list of removed
        column names. ``X`` is returned as-is when nothing is dropped.
    """
    abs_corr = X.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    pairwise = abs_corr.where(above_diagonal)
    dropped = [name for name in pairwise.columns if (pairwise[name] > threshold).any()]
    if not dropped:
        return X, dropped
    return X.drop(columns=dropped), dropped
def balance_classes(X, y):
    """Oversample minority classes until every class matches the largest one.

    Nothing is done when there are fewer than two classes or when the largest
    class is less than twice the size of the smallest.

    Args:
        X: Feature frame.
        y: Target series sharing ``X``'s index.

    Returns:
        ``(X, y, balanced)`` where ``balanced`` tells whether oversampling was
        applied. Resampled rows keep their original index labels, so the
        returned index contains duplicates.
    """
    labels, sizes = np.unique(y, return_counts=True)
    if len(labels) < 2:
        return X, y, False

    largest = sizes.max()
    if largest / sizes.min() < 2:
        return X, y, False

    feature_parts = [X.copy()]
    label_parts = [y.copy()]
    for label, size in zip(labels, sizes):
        if not size < largest:
            continue
        deficit = largest - size
        members = y[y == label].index
        # Draw with replacement; the fixed seed keeps runs reproducible.
        drawn = resample(X.loc[members], replace=True, n_samples=deficit, random_state=42)
        feature_parts.append(drawn)
        label_parts.append(pd.Series([label] * deficit, index=drawn.index))
    return pd.concat(feature_parts), pd.concat(label_parts), True
def select_top_features(X, y, max_features=20):
    """Keep at most ``max_features`` columns, ranked by mutual information.

    Args:
        X: Feature frame.
        y: Target values aligned with ``X``.
        max_features: Maximum number of columns to keep.

    Returns:
        ``(X, names)``: the selected columns and their names. When ``X``
        already has ``max_features`` columns or fewer it is returned unchanged.
    """
    if X.shape[1] <= max_features:
        return X, list(X.columns)

    scores = pd.Series(mutual_info_classif(X, y, random_state=42), index=X.columns)
    keep = scores.sort_values(ascending=False).index[:max_features].tolist()
    return X[keep], keep
def preprocess_for_model(df, target):
    """Run the full preprocessing pipeline for the supervised models.

    Steps, in order: label-encode text columns, clip feature outliers (IQR),
    drop near-constant features, drop highly correlated features, oversample
    minority classes, and keep the top features by mutual information.

    Outlier clipping is applied to the *features only*. Clipping the encoded
    target as well would rewrite class labels (an imbalanced 0/1 target has a
    zero IQR and would lose its minority class entirely).

    NOTE(review): classes are oversampled here, before the caller splits the
    data into train and test sets, so copies of the same row can end up on
    both sides of the split. Confirm whether balancing should instead be
    applied to the training split only.

    Args:
        df: Cleaned dataset.
        target: Name of the target column.

    Returns:
        ``(X, y, transformed, info)`` where ``transformed`` is the encoded
        (unclipped) frame and ``info`` summarises each preprocessing step.
    """
    X, y, transformed = prepare_for_supervised(df, target)

    # Clip outliers in the features; the target labels must stay untouched.
    X, outlier_info = clip_outliers_iqr(X)

    # Remove low variance
    X, low_var = remove_low_variance(X)

    # Remove high correlation
    X, high_corr = remove_high_correlation(X)

    # Balance classes
    X, y, balanced = balance_classes(X, y)

    # Feature selection
    X, _selected = select_top_features(X, y)

    return X, y, transformed, {
        "outliers_clipped": outlier_info,
        "low_var_removed": low_var,
        "high_corr_removed": high_corr,
        "class_balanced": balanced,
        "features_used": list(X.columns),
    }
def show_confusion(y_true, y_pred, title):
    """Render a confusion-matrix heatmap in the centre column of the page.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Plot title.

    Returns:
        The matplotlib figure that was rendered.
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    # Draw on this figure's axes explicitly instead of relying on pyplot's
    # implicit "current axes".
    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt="d",
        cmap="Blues",
        linewidths=1,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    # 1:2:1 column layout keeps the plot centred at half the page width.
    _left, centre, _right = st.columns([1, 2, 1])
    with centre:
        st.pyplot(fig)

    # Unregister the figure from pyplot so figures do not pile up across
    # reruns; the returned object itself remains usable.
    plt.close(fig)
    return fig
def compact_bar(labels, values, title):
    """Render a small bar chart in the centre column of the page.

    Args:
        labels: Category labels for the x axis.
        values: Bar heights, aligned with ``labels``.
        title: Plot title.

    Returns:
        The matplotlib figure that was rendered.
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    # Draw on this figure's axes explicitly instead of relying on pyplot's
    # implicit "current axes".
    sns.barplot(x=labels, y=values, ax=ax)
    ax.tick_params(axis="x", rotation=20)
    ax.set_title(title)

    # 1:2:1 column layout keeps the plot centred at half the page width.
    _left, centre, _right = st.columns([1, 2, 1])
    with centre:
        st.pyplot(fig)

    # Unregister the figure from pyplot so figures do not pile up across
    # reruns; the returned object itself remains usable.
    plt.close(fig)
    return fig
def save_result(name, score, target_col, features_used, extra_info=None):
    """Record a finished model run in the Streamlit session state.

    Updates ``last_model_name`` / ``last_score`` and appends one entry to both
    ``history`` and ``model_results``.

    Args:
        name: Display name of the model.
        score: Score of the run (accuracy or cluster quality, in percent).
        target_col: Target column the run used.
        features_used: Description of the feature columns used.
        extra_info: Optional mapping of additional fields merged into the
            entry (metrics, best parameters, preprocessing details).
    """
    state = st.session_state
    state.last_model_name = name
    state.last_score = score

    record = {
        "Model": name,
        "Score": score,
        "Target": target_col,
        "Features": features_used,
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    if extra_info:
        record.update(extra_info)

    # The same entry object is stored in both lists.
    for log in (state.history, state.model_results):
        log.append(record)
| # --- REPORT GENERATORS --- | |
def generate_text_report(df, target, model_results):
    """Build the plain-text report covering the dataset and every model run.

    Args:
        df: Cleaned dataset the models were trained on.
        target: Target column name. If it is not a column of ``df`` (for
            example the ``"unknown"`` placeholder) the unique-value count is
            reported as ``N/A`` instead of raising.
        model_results: List of result dicts as stored by ``save_result``.
            ``Model`` and ``Score`` are required; all other keys are optional.

    Returns:
        The report as a single newline-joined string.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    def section(title):
        """Return the three lines that introduce a report section."""
        return [light_rule, title, light_rule]

    # Optional per-run fields: (result key, line template), in print order.
    optional_fields = (
        ("Precision", " Precision: {:.2f}%"),
        ("Recall", " Recall: {:.2f}%"),
        ("F1Score", " F1 Score: {:.2f}%"),
        ("BestParams", " Best Hyperparameters: {}"),
        ("OutliersClipped", " Outliers Clipped: {} columns"),
        ("LowVarRemoved", " Low Variance Features Removed: {}"),
        ("HighCorrRemoved", " High Correlation Features Removed: {}"),
        ("ClassBalanced", " Class Balancing Applied: {}"),
        ("BestK", " Optimal Clusters (k): {}"),
    )

    best = max(model_results, key=lambda r: r["Score"]) if model_results else None
    target_unique = df[target].nunique() if target in df.columns else "N/A"

    lines = [
        heavy_rule,
        " DARK AI AUTOML PLATFORM - FULL REPORT",
        heavy_rule,
        f" Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        *section(" DATASET SUMMARY"),
        f" Rows: {df.shape[0]}",
        f" Columns: {df.shape[1]}",
        f" Target Column: {target}",
        f" Target Unique Values: {target_unique}",
        "",
        *section(" COLUMN DETAILS"),
    ]

    for col in df.columns:
        dtype = str(df[col].dtype)
        nunique = df[col].nunique()
        missing = df[col].isnull().sum()
        lines.append(f" {col}: type={dtype}, unique={nunique}, missing={missing}")

    lines.append("")
    lines.extend(section(" MODEL RESULTS (ALL RUNS)"))
    for run_no, run in enumerate(model_results, 1):
        lines.extend([
            "",
            f" Run #{run_no}",
            f" Model: {run['Model']}",
            f" Accuracy/Score: {run['Score']:.2f}%",
            f" Target Feature: {run.get('Target', 'N/A')}",
            f" Features Used: {run.get('Features', 'N/A')}",
            f" Timestamp: {run.get('Timestamp', 'N/A')}",
        ])
        lines.extend(
            template.format(run[key]) for key, template in optional_fields if key in run
        )

    if best:
        lines.append("")
        lines.extend(section(" BEST MODEL"))
        lines.extend([
            f" Model: {best['Model']}",
            f" Score: {best['Score']:.2f}%",
            f" Target: {best.get('Target', 'N/A')}",
        ])

    lines.append("")
    lines.extend(section(" PREPROCESSING PIPELINE"))
    lines.extend([
        " - Duplicate removal",
        " - Missing values handled (median for numeric, mode for categorical)",
        " - Unit conversion (km/cm/mm -> m)",
        " - Categorical encoding (LabelEncoder)",
        " - Outlier clipping (IQR method)",
        " - Low variance feature removal",
        " - High correlation feature removal",
        " - Class imbalance handling (oversampling)",
        " - Feature selection (mutual information, top 20)",
        " - Scaling where required (StandardScaler / RobustScaler)",
        " - Hyperparameter tuning (GridSearchCV)",
        " - Stratified cross-validation (5-fold)",
        "",
        heavy_rule,
        " END OF REPORT",
        heavy_rule,
    ])
    return "\n".join(lines)
def generate_xlsx_report(df, target, model_results):
    """Build the multi-sheet XLSX report in memory.

    Sheets: "Dataset Summary", "Column Details", "Model Results" and, when at
    least one run exists, "Best Model" (the run with the highest ``Score``).

    Args:
        df: Cleaned dataset the models were trained on.
        target: Target column name. If it is not a column of ``df`` (for
            example the ``"unknown"`` placeholder) the unique-value count is
            reported as ``N/A`` instead of raising.
        model_results: List of result dicts as stored by ``save_result``.

    Returns:
        An ``io.BytesIO`` positioned at the start of the workbook bytes.
    """
    target_unique = df[target].nunique() if target in df.columns else "N/A"

    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        # Sheet 1: Dataset Summary
        summary = pd.DataFrame({
            "Property": ["Rows", "Columns", "Target Column", "Target Unique Values"],
            "Value": [df.shape[0], df.shape[1], target, target_unique],
        })
        summary.to_excel(writer, sheet_name="Dataset Summary", index=False)

        # Sheet 2: Column Details
        column_details = [
            {
                "Column": col,
                "Type": str(df[col].dtype),
                "Unique Values": df[col].nunique(),
                "Missing Values": df[col].isnull().sum(),
            }
            for col in df.columns
        ]
        pd.DataFrame(column_details).to_excel(
            writer, sheet_name="Column Details", index=False
        )

        # Sheet 3: Model Results (one row per run; absent keys become blanks)
        pd.DataFrame(model_results).to_excel(
            writer, sheet_name="Model Results", index=False
        )

        # Sheet 4: Best Model
        if model_results:
            best = max(model_results, key=lambda r: r["Score"])
            pd.DataFrame([best]).to_excel(writer, sheet_name="Best Model", index=False)

    buffer.seek(0)
    return buffer
| # ========================================================== | |
| # UPLOAD | |
| # ========================================================== | |
| st.markdown('<div class="section">📁 Upload Dataset</div>', unsafe_allow_html=True) | |
| # Only .csv uploads are accepted; the code below is gated on `file` being truthy. | |
| file = st.file_uploader("Upload CSV File", type=["csv"]) | |
| # ========================================================== | |
| # MAIN APP | |
| # ========================================================== | |
| # NOTE(review): indentation is not preserved in this copy of the file, so the exact | |
| # extent of the `if file:` body below should be confirmed against the original source. | |
| if file: | |
| # Parse the upload and show the first rows exactly as uploaded. | |
| raw = pd.read_csv(file) | |
| st.markdown('<div class="section">📌 Dataset Preview</div>', unsafe_allow_html=True) | |
| st.dataframe(raw.head(), use_container_width=True) | |
| # Clean (de-duplicate + impute), then convert unit-suffixed text columns to numbers. | |
| df = smart_clean(raw) | |
| df = detect_unit_columns(df) | |
| # Keep the cleaned frame in session state for the report generators. | |
| st.session_state.cleaned_df = df | |
| # ------------------------------------------------------ | |
| # TARGET DETECTION | |
| # ------------------------------------------------------ | |
| st.markdown('<div class="section">🎯 AI Target Detection</div>', unsafe_allow_html=True) | |
| # Heuristic ranking of columns; top5 is a list of (column, score) pairs. | |
| best_target, top5 = detect_best_target(df) | |
| st.success(f"Recommended Target Column: {best_target}") | |
| st.write("Top Suggestions:") | |
| for n, s in top5: | |
| st.write(f"• {n} (score: {s})") | |
| # Dropdown with AI recommendation pre-selected, user can override | |
| # (the recommended column is listed first, followed by all other columns). | |
| target = st.selectbox( | |
| "Choose Target Column (AI recommended is pre-selected - change if needed)", | |
| [best_target] + [c for c in df.columns if c != best_target] | |
| ) | |
| # Remember the choice so the reports can reference it. | |
| st.session_state.selected_target = target | |
| # ------------------------------------------------------ | |
| # MODEL SELECT | |
| # ------------------------------------------------------ | |
| st.markdown('<div class="section">🤖 Choose Model</div>', unsafe_allow_html=True) | |
| # These labels are compared verbatim in the model branches and are the keys | |
| # of `file_map` in the download section. | |
| model_choice = st.selectbox( | |
| "Select One Model", | |
| [ | |
| "Random Forest", | |
| "SVM", | |
| "Logistic Regression", | |
| "Decision Tree", | |
| "KMeans Clustering" | |
| ] | |
| ) | |
| # ------------------------------------------------------ | |
| # APPLY MODEL | |
| # ------------------------------------------------------ | |
| # Training only runs on the script run in which the button was clicked. | |
| if st.button("🚀 Apply Model"): | |
| # Each model result is in its own container so | |
| # applying a second model shows results separately beneath the first | |
| # RANDOM FOREST | |
| if model_choice == "Random Forest": | |
| # Encode, clip, prune, balance and select features; pp_info summarises each step. | |
| # NOTE(review): preprocess_for_model oversamples minority classes before the | |
| # train/test split below, so duplicated rows can land on both sides of the | |
| # split - verify that the reported metrics are not inflated. | |
| X, y, transformed, pp_info = preprocess_for_model(df, target) | |
| features_used = pp_info["features_used"] | |
| result_box = st.container() | |
| with result_box: | |
| # NOTE(review): the opening and closing <div> are sent in separate st.markdown | |
| # calls; confirm Streamlit really wraps the widgets in between. | |
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True) | |
| st.markdown(f"### Random Forest Results (Target: {target})") | |
| # Side-by-side preview: raw upload vs. label-encoded frame. | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| st.write("Original") | |
| st.dataframe(raw.head()) | |
| with col2: | |
| st.write("Processed") | |
| st.dataframe(transformed.head()) | |
| # 80/20 split, stratified on the target, fixed seed. | |
| X_train, X_test, y_train, y_test = train_test_split( | |
| X, y, test_size=0.2, random_state=42, stratify=y | |
| ) | |
| # 5-fold stratified CV drives the grid search (3 * 4 * 2 * 2 = 48 candidates). | |
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) | |
| # NOTE(review): RandomForestClassifier() is created without random_state, so | |
| # results may differ from run to run. | |
| model = GridSearchCV( | |
| RandomForestClassifier(), | |
| { | |
| "n_estimators":[100,200,300], | |
| "max_depth":[5,10,15,None], | |
| "min_samples_split":[2,5], | |
| "min_samples_leaf":[1,2] | |
| }, | |
| cv=cv, | |
| n_jobs=-1 | |
| ) | |
| model.fit(X_train, y_train) | |
| pred = model.predict(X_test) | |
| # All metrics are expressed in percent; precision/recall/F1 are class-weighted. | |
| acc = accuracy_score(y_test, pred)*100 | |
| prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| st.success(f"Accuracy: {acc:.2f}%") | |
| st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%") | |
| show_confusion(y_test, pred, "Random Forest Matrix") | |
| # Eight most important features of the best estimator. | |
| imp = pd.Series( | |
| model.best_estimator_.feature_importances_, | |
| index=X.columns | |
| ).sort_values(ascending=False).head(8) | |
| compact_bar(imp.index, imp.values, "Feature Importance") | |
| st.write("**Classification Report:**") | |
| st.text(classification_report(y_test, pred, zero_division=0)) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # Persist the best estimator under a fixed name in the working directory; | |
| # the download section reads it back by the same name. | |
| joblib.dump(model.best_estimator_, "random_forest.pkl") | |
| save_result("Random Forest", acc, target, ", ".join(features_used), { | |
| "Precision": prec, | |
| "Recall": rec, | |
| "F1Score": f1, | |
| "BestParams": str(model.best_params_), | |
| "OutliersClipped": len(pp_info["outliers_clipped"]), | |
| "LowVarRemoved": str(pp_info["low_var_removed"]), | |
| "HighCorrRemoved": str(pp_info["high_corr_removed"]), | |
| "ClassBalanced": pp_info["class_balanced"], | |
| }) | |
| # SVM | |
| elif model_choice == "SVM": | |
| # Same preprocessing pipeline as the other supervised models. | |
| X, y, transformed, pp_info = preprocess_for_model(df, target) | |
| features_used = pp_info["features_used"] | |
| result_box = st.container() | |
| with result_box: | |
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True) | |
| st.markdown(f"### SVM Results (Target: {target})") | |
| # 80/20 split, stratified on the target, fixed seed. | |
| X_train, X_test, y_train, y_test = train_test_split( | |
| X, y, test_size=0.2, random_state=42, stratify=y | |
| ) | |
| # RobustScaler for SVM (handles outliers better) | |
| # The scaler is fitted on the training split only and then applied to the test split. | |
| sc = RobustScaler() | |
| X_train = sc.fit_transform(X_train) | |
| X_test = sc.transform(X_test) | |
| # 5-fold stratified CV over 4 * 3 * 2 = 24 parameter combinations. | |
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) | |
| model = GridSearchCV( | |
| SVC(), | |
| { | |
| "C":[0.1,1,10,100], | |
| "kernel":["rbf","linear","poly"], | |
| "gamma":["scale","auto"] | |
| }, | |
| cv=cv, | |
| n_jobs=-1 | |
| ) | |
| model.fit(X_train, y_train) | |
| pred = model.predict(X_test) | |
| # All metrics are expressed in percent; precision/recall/F1 are class-weighted. | |
| acc = accuracy_score(y_test, pred)*100 | |
| prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| st.success(f"Accuracy: {acc:.2f}%") | |
| st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%") | |
| show_confusion(y_test, pred, "SVM Matrix") | |
| st.write("**Classification Report:**") | |
| st.text(classification_report(y_test, pred, zero_division=0)) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # NOTE(review): only the estimator is saved; the fitted RobustScaler `sc` is not | |
| # persisted, so the downloaded model expects inputs that are already scaled. | |
| joblib.dump(model.best_estimator_, "svm.pkl") | |
| save_result("SVM", acc, target, ", ".join(features_used), { | |
| "Precision": prec, | |
| "Recall": rec, | |
| "F1Score": f1, | |
| "BestParams": str(model.best_params_), | |
| "OutliersClipped": len(pp_info["outliers_clipped"]), | |
| "LowVarRemoved": str(pp_info["low_var_removed"]), | |
| "HighCorrRemoved": str(pp_info["high_corr_removed"]), | |
| "ClassBalanced": pp_info["class_balanced"], | |
| }) | |
| # LOGISTIC | |
| elif model_choice == "Logistic Regression": | |
| # Same preprocessing pipeline as the other supervised models. | |
| X, y, transformed, pp_info = preprocess_for_model(df, target) | |
| features_used = pp_info["features_used"] | |
| result_box = st.container() | |
| with result_box: | |
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True) | |
| st.markdown(f"### Logistic Regression Results (Target: {target})") | |
| # 80/20 split, stratified on the target, fixed seed. | |
| X_train, X_test, y_train, y_test = train_test_split( | |
| X, y, test_size=0.2, random_state=42, stratify=y | |
| ) | |
| # Standardise features; the scaler is fitted on the training split only. | |
| sc = StandardScaler() | |
| X_train = sc.fit_transform(X_train) | |
| X_test = sc.transform(X_test) | |
| # 5-fold stratified CV over 5 * 2 = 10 combinations of C and penalty; the | |
| # liblinear solver is used with both the l1 and the l2 penalty in the grid. | |
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) | |
| model = GridSearchCV( | |
| LogisticRegression(max_iter=5000, solver="liblinear"), | |
| { | |
| "C":[0.01,0.1,1,10,100], | |
| "penalty":["l1","l2"] | |
| }, | |
| cv=cv, | |
| n_jobs=-1 | |
| ) | |
| model.fit(X_train, y_train) | |
| pred = model.predict(X_test) | |
| # All metrics are expressed in percent; precision/recall/F1 are class-weighted. | |
| acc = accuracy_score(y_test, pred)*100 | |
| prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| st.success(f"Accuracy: {acc:.2f}%") | |
| st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%") | |
| show_confusion(y_test, pred, "Logistic Regression Matrix") | |
| # Show coefficient magnitudes for logistic regression | |
| # NOTE(review): coef_[0] is the first coefficient row only; for multi-class targets | |
| # this presumably reflects a single class - confirm that is intended. | |
| if hasattr(model.best_estimator_, "coef_"): | |
| coef = pd.Series( | |
| np.abs(model.best_estimator_.coef_[0]), | |
| index=X.columns | |
| ).sort_values(ascending=False).head(8) | |
| compact_bar(coef.index, coef.values, "Feature Coefficients (Absolute)") | |
| st.write("**Classification Report:**") | |
| st.text(classification_report(y_test, pred, zero_division=0)) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # NOTE(review): only the estimator is saved; the fitted StandardScaler `sc` is not | |
| # persisted, so the downloaded model expects inputs that are already scaled. | |
| joblib.dump(model.best_estimator_, "logistic.pkl") | |
| save_result("Logistic Regression", acc, target, ", ".join(features_used), { | |
| "Precision": prec, | |
| "Recall": rec, | |
| "F1Score": f1, | |
| "BestParams": str(model.best_params_), | |
| "OutliersClipped": len(pp_info["outliers_clipped"]), | |
| "LowVarRemoved": str(pp_info["low_var_removed"]), | |
| "HighCorrRemoved": str(pp_info["high_corr_removed"]), | |
| "ClassBalanced": pp_info["class_balanced"], | |
| }) | |
| # DECISION TREE | |
| elif model_choice == "Decision Tree": | |
| # Same preprocessing pipeline as the other supervised models. | |
| X, y, transformed, pp_info = preprocess_for_model(df, target) | |
| features_used = pp_info["features_used"] | |
| result_box = st.container() | |
| with result_box: | |
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True) | |
| st.markdown(f"### Decision Tree Results (Target: {target})") | |
| # 80/20 split, stratified on the target, fixed seed. | |
| X_train, X_test, y_train, y_test = train_test_split( | |
| X, y, test_size=0.2, random_state=42, stratify=y | |
| ) | |
| # 5-fold stratified CV over 5 * 3 * 3 * 2 = 90 parameter combinations. | |
| cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) | |
| # NOTE(review): DecisionTreeClassifier() is created without random_state, so | |
| # results may differ from run to run. | |
| model = GridSearchCV( | |
| DecisionTreeClassifier(), | |
| { | |
| "max_depth":[3,5,10,15,None], | |
| "min_samples_split":[2,5,10], | |
| "min_samples_leaf":[1,2,4], | |
| "criterion":["gini","entropy"] | |
| }, | |
| cv=cv, | |
| n_jobs=-1 | |
| ) | |
| model.fit(X_train, y_train) | |
| pred = model.predict(X_test) | |
| # All metrics are expressed in percent; precision/recall/F1 are class-weighted. | |
| acc = accuracy_score(y_test, pred)*100 | |
| prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100 | |
| st.success(f"Accuracy: {acc:.2f}%") | |
| st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%") | |
| show_confusion(y_test, pred, "Decision Tree Matrix") | |
| # Feature importance for decision tree | |
| # (eight most important features of the best estimator). | |
| imp = pd.Series( | |
| model.best_estimator_.feature_importances_, | |
| index=X.columns | |
| ).sort_values(ascending=False).head(8) | |
| compact_bar(imp.index, imp.values, "Feature Importance") | |
| st.write("**Classification Report:**") | |
| st.text(classification_report(y_test, pred, zero_division=0)) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # Persist the best estimator under a fixed name in the working directory; | |
| # the download section reads it back by the same name. | |
| joblib.dump(model.best_estimator_, "decision_tree.pkl") | |
| save_result("Decision Tree", acc, target, ", ".join(features_used), { | |
| "Precision": prec, | |
| "Recall": rec, | |
| "F1Score": f1, | |
| "BestParams": str(model.best_params_), | |
| "OutliersClipped": len(pp_info["outliers_clipped"]), | |
| "LowVarRemoved": str(pp_info["low_var_removed"]), | |
| "HighCorrRemoved": str(pp_info["high_corr_removed"]), | |
| "ClassBalanced": pp_info["class_balanced"], | |
| }) | |
| # KMEANS | |
| elif model_choice == "KMeans Clustering": | |
| # Unsupervised path: label-encode text columns inline, then cluster on every | |
| # column except the selected target. | |
| temp = df.copy() | |
| for col in temp.columns: | |
| if temp[col].dtype == "object": | |
| le = LabelEncoder() | |
| temp[col] = le.fit_transform(temp[col].astype(str)) | |
| # NOTE(review): X is not used below; X_clipped is what gets scaled and clustered. | |
| X = temp.drop(columns=[target]) | |
| # Clip outliers for clustering too | |
| temp_clipped, outlier_info = clip_outliers_iqr(temp) | |
| X_clipped = temp_clipped.drop(columns=[target]) | |
| sc = StandardScaler() | |
| Xs = sc.fit_transform(X_clipped) | |
| # Find optimal k using elbow method | |
| inertias = [] | |
| # Candidate k values: from 2 up to at most 10, capped at len(df) // 10. | |
| # With fewer than 20 rows the range is empty and best_k keeps its default of 3. | |
| K_range = range(2, min(11, len(df) // 10 + 1)) | |
| for k in K_range: | |
| km = KMeans(n_clusters=k, random_state=42, n_init=10) | |
| km.fit(Xs) | |
| inertias.append(km.inertia_) | |
| best_k = 3 | |
| if len(inertias) >= 3: | |
| # Pick the k that follows the largest drop in inertia between consecutive k values. | |
| diffs = [inertias[i] - inertias[i+1] for i in range(len(inertias)-1)] | |
| if diffs: | |
| elbow_idx = np.argmax(diffs) + 1 | |
| best_k = list(K_range)[elbow_idx] if elbow_idx < len(list(K_range)) else 3 | |
| # Keep k within [2, 10] whatever the heuristic produced. | |
| best_k = max(2, min(best_k, 10)) | |
| result_box = st.container() | |
| with result_box: | |
| st.markdown('<div class="model-result-box">', unsafe_allow_html=True) | |
| st.markdown(f"### KMeans Clustering Results (Target: {target})") | |
| # Final fit with the chosen k; the silhouette score is reported as a percentage. | |
| model = KMeans(n_clusters=best_k, random_state=42, n_init=10) | |
| cluster = model.fit_predict(Xs) | |
| score = silhouette_score(Xs, cluster)*100 | |
| st.success(f"Cluster Quality Score: {score:.2f}% (k={best_k})") | |
| # Scatter of the first two scaled feature columns, coloured by cluster. | |
| # NOTE(review): indexing column 1 assumes at least two feature columns - confirm. | |
| fig, ax = plt.subplots(figsize=(6,4)) | |
| plt.scatter(Xs[:,0], Xs[:,1], c=cluster, cmap="viridis") | |
| plt.title(f"Clusters (k={best_k})") | |
| col1, col2, col3 = st.columns([1, 2, 1]) | |
| with col2: | |
| st.pyplot(fig) | |
| # Elbow plot | |
| fig2, ax2 = plt.subplots(figsize=(6,3)) | |
| plt.plot(list(K_range), inertias, "bo-") | |
| plt.xlabel("Number of Clusters (k)") | |
| plt.ylabel("Inertia") | |
| plt.title("Elbow Method") | |
| col1, col2, col3 = st.columns([1, 2, 1]) | |
| with col2: | |
| st.pyplot(fig2) | |
| # Cluster distribution | |
| cluster_counts = pd.Series(cluster).value_counts().sort_index() | |
| fig3, ax3 = plt.subplots(figsize=(6,3)) | |
| sns.barplot(x=cluster_counts.index, y=cluster_counts.values) | |
| plt.xlabel("Cluster") | |
| plt.ylabel("Count") | |
| plt.title("Cluster Distribution") | |
| col1, col2, col3 = st.columns([1, 2, 1]) | |
| with col2: | |
| st.pyplot(fig3) | |
| st.markdown('</div>', unsafe_allow_html=True) | |
| # NOTE(review): only the KMeans model is saved; the fitted StandardScaler `sc` is not | |
| # persisted, so the downloaded model expects inputs that are already scaled. | |
| joblib.dump(model, "kmeans.pkl") | |
| # The silhouette percentage is stored in the same "Score" field as accuracy. | |
| save_result("KMeans Clustering", score, target, ", ".join(X_clipped.columns), { | |
| "BestK": best_k, | |
| "OutliersClipped": len(outlier_info), | |
| }) | |
| # ========================================================== | |
| # DOWNLOAD SECTION | |
| # ========================================================== | |
| # Offer the most recently trained model for download, if there is one. | |
| if st.session_state.last_model_name: | |
| st.markdown('<div class="section">⬇ Downloads</div>', unsafe_allow_html=True) | |
| # Model display name -> file name written by joblib.dump in the model branches. | |
| file_map = { | |
| "Random Forest":"random_forest.pkl", | |
| "SVM":"svm.pkl", | |
| "Logistic Regression":"logistic.pkl", | |
| "Decision Tree":"decision_tree.pkl", | |
| "KMeans Clustering":"kmeans.pkl" | |
| } | |
| current = file_map[st.session_state.last_model_name] | |
| # NOTE(review): the files live under fixed names in the working directory, so they are | |
| # presumably shared between concurrent sessions - confirm this is acceptable. | |
| if os.path.exists(current): | |
| with open(current, "rb") as f: | |
| st.download_button( | |
| label=f"Download {st.session_state.last_model_name} (Deploy Ready)", | |
| data=f, | |
| file_name=current, | |
| mime="application/octet-stream" | |
| ) | |
| # ========================================================== | |
| # HISTORY + REPORTS | |
| # ========================================================== | |
| # Shown once at least one model run has been recorded by save_result. | |
| if len(st.session_state.history) > 0: | |
| st.markdown('<div class="section">📊 History</div>', unsafe_allow_html=True) | |
| # One row per run; keys missing from a run show up as empty cells. | |
| hist = pd.DataFrame(st.session_state.history) | |
| st.dataframe(hist, use_container_width=True) | |
| # Bar chart of the "Score" column per model name. | |
| fig, ax = plt.subplots(figsize=(6,3)) | |
| sns.barplot(data=hist, x="Model", y="Score") | |
| plt.xticks(rotation=20) | |
| plt.title("All Applied Models") | |
| col1, col2, col3 = st.columns([1, 2, 1]) | |
| with col2: | |
| st.pyplot(fig) | |
| # CSV | |
| csv_buffer = io.StringIO() | |
| hist.to_csv(csv_buffer, index=False) | |
| st.download_button( | |
| "Download Results CSV", | |
| csv_buffer.getvalue(), | |
| "results.csv" | |
| ) | |
| # TXT report | |
| # Needs the cleaned frame and at least one detailed result; the target falls back | |
| # to the placeholder "unknown" when none has been stored. | |
| if st.session_state.cleaned_df is not None and len(st.session_state.model_results) > 0: | |
| report_text = generate_text_report( | |
| st.session_state.cleaned_df, | |
| st.session_state.selected_target or "unknown", | |
| st.session_state.model_results | |
| ) | |
| st.download_button( | |
| "Download Full Report (TXT)", | |
| report_text, | |
| "full_report.txt", | |
| mime="text/plain" | |
| ) | |
| # XLSX report | |
| # NOTE(review): any failure here (for example a missing openpyxl install) is swallowed | |
| # silently and the XLSX button simply does not appear; consider surfacing a warning. | |
| try: | |
| xlsx_data = generate_xlsx_report( | |
| st.session_state.cleaned_df, | |
| st.session_state.selected_target or "unknown", | |
| st.session_state.model_results | |
| ) | |
| st.download_button( | |
| "Download Full Report (XLSX)", | |
| data=xlsx_data.getvalue(), | |
| file_name="full_report.xlsx", | |
| mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
| ) | |
| except Exception: | |
| pass | |
| # ========================================================== | |
| # RESET | |
| # ========================================================== | |
| st.markdown('<div class="section">♻ Reset</div>', unsafe_allow_html=True) | |
| # Restore every session-state key used by the app to its initial value. | |
| # The .pkl files written to disk by the model branches are not removed here. | |
| if st.button("Clear History"): | |
| st.session_state.history = [] | |
| st.session_state.last_model_name = None | |
| st.session_state.last_score = None | |
| st.session_state.model_results = [] | |
| st.session_state.selected_target = None | |
| st.session_state.cleaned_df = None | |
| st.success("History Cleared") | |