import xgboost as xgb
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score

class MLEngine:
    """Train, persist and serve three XGBoost models for match prediction.

    The engine manages a home-goals regressor, an away-goals regressor and
    a "both teams to score" (BTTS) classifier. Models and the list of
    feature names they were trained on are stored with joblib inside
    ``models_dir`` and are loaded automatically on construction when
    present.
    """

    # File names of the persisted artifacts inside ``models_dir``.
    _HOME_MODEL_FILE = "xgb_home_goals.joblib"
    _AWAY_MODEL_FILE = "xgb_away_goals.joblib"
    _BTTS_MODEL_FILE = "xgb_btts.joblib"
    _FEATURE_NAMES_FILE = "feature_names.joblib"

    # Identifier and target columns that must never be used as features.
    _NON_FEATURE_COLS = (
        'id_jogo', 'data_jogo', 'time_casa', 'time_fora',
        'placar_casa', 'placar_fora', 'match_result',
    )

    def __init__(self, models_dir="models"):
        """Create the engine and try to load previously trained models.

        Args:
            models_dir: Directory holding the persisted models. It is
                created if it does not exist.
        """
        self.models_dir = models_dir
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(models_dir, exist_ok=True)

        self.model_home = None
        self.model_away = None
        self.model_btts = None
        # Feature names matching the models currently held in memory.
        self._feature_names = None

        self.load_models()

    def _artifact_path(self, filename):
        """Return the path of a persisted artifact inside ``models_dir``."""
        return os.path.join(self.models_dir, filename)

    def _load_feature_names(self):
        """Return the saved feature-name list, or None if none was saved."""
        try:
            return joblib.load(self._artifact_path(self._FEATURE_NAMES_FILE))
        except FileNotFoundError:
            # Older model directories were saved without feature names.
            return None

    def load_models(self):
        """Loads trained models from disk if they exist.

        All three models are read before any attribute is assigned, so a
        missing file never leaves the engine with only some models loaded;
        in that case the previously held models are left untouched.
        """
        try:
            home = joblib.load(self._artifact_path(self._HOME_MODEL_FILE))
            away = joblib.load(self._artifact_path(self._AWAY_MODEL_FILE))
            btts = joblib.load(self._artifact_path(self._BTTS_MODEL_FILE))
        except FileNotFoundError:
            print("No trained models found. Training needed.")
            return

        self.model_home = home
        self.model_away = away
        self.model_btts = btts
        self._feature_names = self._load_feature_names()
        print("Models loaded successfully.")

    def _select_features(self, df):
        """Return the numeric feature columns of ``df``.

        Identifier/target columns and merge leftovers (``*_x`` / ``*_y``)
        are dropped, then only numeric dtypes are kept because pandas
        object columns can break xgboost.
        """
        excluded = set(self._NON_FEATURE_COLS)
        excluded.update(c for c in df.columns if c.endswith(('_x', '_y')))
        feature_cols = [c for c in df.columns if c not in excluded]
        return df[feature_cols].select_dtypes(include=[np.number])

    @staticmethod
    def _build_goal_regressor():
        """Build the regressor configuration shared by both goal models."""
        return xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=2000,
            learning_rate=0.005,  # Very low rate to capture subtle differences
            max_depth=4,  # Shallow trees to prevent over-segmentation of noise
            subsample=0.6,
            colsample_bytree=0.8,
            n_jobs=-1,
            early_stopping_rounds=50,
            reg_alpha=0,  # No L1 regularization, so small features survive
            reg_lambda=0.1,  # Minimal L2 regularization
            min_child_weight=1,  # Allow splits on smaller differences
        )

    def train(self, df: pd.DataFrame):
        """
        Trains the XGBoost models on the provided dataframe.
        df should contain features AND target columns (placar_casa, placar_fora).

        Returns:
            A metrics dict with ``mae_home``, ``mae_away``, ``acc_btts`` and
            ``samples``; or ``{"status": "skipped", "reason": ...}`` when
            there are fewer than 10 rows or no numeric feature columns.

        Raises:
            KeyError: If a target column is missing from ``df``.
        """
        if df.empty or len(df) < 10:
            print("Not enough data to train models.")
            return {"status": "skipped", "reason": "insufficient_data"}

        X = self._select_features(df)
        if X.shape[1] == 0:
            # Fitting on zero columns would only fail deep inside the library.
            print("No numeric feature columns available to train models.")
            return {"status": "skipped", "reason": "no_numeric_features"}

        # DEBUG: Check feature variance
        print("DEBUG: Feature Variance Check (first 5 columns):")
        print(X.var().head())
        print("DEBUG: Sample Row (Last Row in Dataset - Should have data):")
        print(X.iloc[-1])  # Show the last row instead of the first

        feature_names = X.columns.tolist()

        y_home = df['placar_casa']
        y_away = df['placar_fora']
        y_btts = ((df['placar_casa'] > 0) & (df['placar_fora'] > 0)).astype(int)

        # One call splits features and all targets with the same row
        # partition (identical to three separate calls sharing a seed).
        (X_train, X_test,
         y_h_train, y_h_test,
         y_a_train, y_a_test,
         y_b_train, y_b_test) = train_test_split(
            X, y_home, y_away, y_btts, test_size=0.2, random_state=42)

        # Early stopping needs a validation set; carve it from the training
        # part so the test set stays untouched for the final evaluation.
        (X_fit, X_val,
         y_h_fit, y_h_val,
         y_a_fit, y_a_val,
         y_b_fit, y_b_val) = train_test_split(
            X_train, y_h_train, y_a_train, y_b_train,
            test_size=0.1, random_state=42)

        # Fit into locals first: if any fit fails, the engine keeps its
        # previous, mutually consistent set of models.
        model_home = self._build_goal_regressor()
        model_home.fit(X_fit, y_h_fit, eval_set=[(X_val, y_h_val)], verbose=False)

        model_away = self._build_goal_regressor()
        model_away.fit(X_fit, y_a_fit, eval_set=[(X_val, y_a_val)], verbose=False)

        model_btts = xgb.XGBClassifier(
            objective='binary:logistic',
            n_estimators=1000,
            learning_rate=0.01,
            max_depth=6,
            subsample=0.7,
            colsample_bytree=0.7,
            n_jobs=-1,
            eval_metric='logloss',
            early_stopping_rounds=20,
            reg_alpha=0.1,
            reg_lambda=1.0,
        )
        model_btts.fit(X_fit, y_b_fit, eval_set=[(X_val, y_b_val)], verbose=False)

        # Evaluate on the held-out test split.
        metrics = {
            "mae_home": mean_absolute_error(y_h_test, model_home.predict(X_test)),
            "mae_away": mean_absolute_error(y_a_test, model_away.predict(X_test)),
            "acc_btts": accuracy_score(y_b_test, model_btts.predict(X_test)),
            "samples": len(df)
        }

        # Publish the new models together with their feature names.
        self.model_home = model_home
        self.model_away = model_away
        self.model_btts = model_btts
        self._feature_names = feature_names

        # Save models and feature names
        joblib.dump(model_home, self._artifact_path(self._HOME_MODEL_FILE))
        joblib.dump(model_away, self._artifact_path(self._AWAY_MODEL_FILE))
        joblib.dump(model_btts, self._artifact_path(self._BTTS_MODEL_FILE))
        joblib.dump(feature_names, self._artifact_path(self._FEATURE_NAMES_FILE))

        print(f"Training complete. Metrics: {metrics}")
        return metrics

    def predict(self, features_df: pd.DataFrame):
        """
        Predicts home goals, away goals, and BTTS probability.

        Only the first row of ``features_df`` is used for the returned
        values. The caller's dataframe is never modified: columns are
        aligned to the training feature list on a new frame, with missing
        features filled with 0 and extra columns dropped.

        Returns:
            Dict with ``expected_home_goals``, ``expected_away_goals``
            (both floored at 0.01) and ``btts_prob``.

        Raises:
            ValueError: If any model is missing or ``features_df`` has no rows.
        """
        if (self.model_home is None or self.model_away is None
                or self.model_btts is None):
            raise ValueError("Models not trained/loaded.")
        if len(features_df) == 0:
            raise ValueError("features_df must contain at least one row.")

        # Prefer the names cached with the in-memory models; fall back to
        # disk for engines whose cache was never populated.
        expected_features = getattr(self, "_feature_names", None)
        if expected_features is None:
            expected_features = self._load_feature_names()
            self._feature_names = expected_features

        if expected_features is not None:
            # reindex returns a new frame, so the caller's data is untouched;
            # fill_value only applies to columns that were missing.
            features_df = features_df.reindex(columns=expected_features, fill_value=0)
        # else: older model without saved feature names; use columns as given.

        home_goals = self.model_home.predict(features_df)[0]
        away_goals = self.model_away.predict(features_df)[0]
        btts_prob = self.model_btts.predict_proba(features_df)[0][1]  # P(class 1)

        # A regressor can output negative values; goals cannot be negative.
        return {
            "expected_home_goals": float(max(0.01, home_goals)),
            "expected_away_goals": float(max(0.01, away_goals)),
            "btts_prob": float(btts_prob)
        }
