Domanda

image.png
image.png
image.png
image.png
image.png
image.png
image.png
image.png

give me jupyter notebook code

Pensiero

Writing notebook code for ML analysis

The user asks for Jupyter notebook code based on images, but I can’t run code without data files. I’ll prepare code that covers EDA, preprocessing, and machine learning tasks like weighted F1, multiple models, feature importance, and outputting submissions.csv. Since only screenshots are available, the code will be ready for them to run.

Thought for 14s

Risposta

Paste these cells into your Jupyter notebook.

python
# Libraries import pandas as pd import numpy as np from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.impute import SimpleImputer from sklearn.metrics import f1_score, classification_report, confusion_matrix from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier from sklearn.linear_model import LogisticRegression pd.set_option("display.max_columns", 100) pd.set_option("display.max_colwidth", 100)
python
# Load the training and test datasets and take a first look.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

print(train.shape)
print(test.shape)
display(train.head())
display(test.head())
python
# Basic EDA: dtypes, missingness, target balance, and duplicate rows.
print(train.info())
print("\nMissing values in train:")
display(train.isna().mean().sort_values(ascending=False))
print("\nTarget distribution:")
display(train["outcome"].value_counts(normalize=True))
print("\nDuplicate rows:", train.duplicated().sum())
python
def prepare_features(df):
    """Return a copy of *df* with engineered time, location, and trip features.

    Expects columns: request_timestamp, pickup_location_id, drop_location_id,
    expected_time_to_pickup, expected_trip_duration. The raw timestamp column
    is dropped after its components have been extracted.
    """
    df = df.copy()

    # Parse the timestamp; unparseable values become NaT instead of raising.
    df["request_timestamp"] = pd.to_datetime(df["request_timestamp"], errors="coerce")

    # Calendar/time components.
    df["request_hour"] = df["request_timestamp"].dt.hour
    df["request_day"] = df["request_timestamp"].dt.day
    df["request_dayofweek"] = df["request_timestamp"].dt.dayofweek
    df["request_month"] = df["request_timestamp"].dt.month
    df["request_minute"] = df["request_timestamp"].dt.minute
    df["is_weekend"] = df["request_dayofweek"].isin([5, 6]).astype(int)

    # Cyclic encodings so 23h is close to 0h and Sunday close to Monday.
    two_pi = 2 * np.pi
    df["hour_sin"] = np.sin(two_pi * df["request_hour"] / 24)
    df["hour_cos"] = np.cos(two_pi * df["request_hour"] / 24)
    df["dow_sin"] = np.sin(two_pi * df["request_dayofweek"] / 7)
    df["dow_cos"] = np.cos(two_pi * df["request_dayofweek"] / 7)

    # Treat location ids as categorical and build a combined route key.
    df["pickup_location_id"] = df["pickup_location_id"].astype("object")
    df["drop_location_id"] = df["drop_location_id"].astype("object")
    df["route"] = (
        df["pickup_location_id"].astype(str) + "_" + df["drop_location_id"].astype(str)
    )

    # Trip-related engineered features; +1 in denominators avoids division by zero.
    df["total_expected_time"] = df["expected_time_to_pickup"] + df["expected_trip_duration"]
    df["pickup_to_trip_ratio"] = df["expected_time_to_pickup"] / (df["expected_trip_duration"] + 1)
    df["trip_to_pickup_ratio"] = df["expected_trip_duration"] / (df["expected_time_to_pickup"] + 1)

    # The raw timestamp is no longer needed once decomposed.
    return df.drop(columns=["request_timestamp"], errors="ignore")
python
# Separate target, IDs, and features.
# FIX: also drop the "id" column from the feature matrices — a row identifier
# carries no signal and can leak dataset ordering into tree models (the later
# optimized versions of this notebook already do this).
target = "outcome"
y = train[target]
test_ids = test["id"]

X = train.drop(columns=[target, "id"], errors="ignore")
X_test = test.drop(columns=["id"], errors="ignore")

X = prepare_features(X)
X_test = prepare_features(X_test)

display(X.head())
python
# Drop columns with more than 50% missing values (too sparse to impute well).
missing_ratio = X.isna().mean()
high_missing = missing_ratio[missing_ratio > 0.50].index.tolist()
print("Columns dropped due to >50% missing:")
print(high_missing)

X = X.drop(columns=high_missing, errors="ignore")
X_test = X_test.drop(columns=high_missing, errors="ignore")
python
# Drop columns that hold a single value (including NaN-only) — zero information.
constant_cols = [c for c in X.columns if X[c].nunique(dropna=False) <= 1]
print("Constant columns dropped:")
print(constant_cols)

X = X.drop(columns=constant_cols, errors="ignore")
X_test = X_test.drop(columns=constant_cols, errors="ignore")
python
# Handle unwanted values in expected_trip_duration:
# a zero or negative duration is physically impossible, so mark it missing
# and let the imputer deal with it downstream.
for frame in (X, X_test):
    if "expected_trip_duration" in frame.columns:
        frame.loc[frame["expected_trip_duration"] <= 0, "expected_trip_duration"] = np.nan
python
# Identify numeric and categorical columns.
# FIX: use np.number so every numeric dtype (int8/16/32/64, uint*, float16/32/64)
# is captured, not just the four dtype names listed before — otherwise columns
# with e.g. downcast dtypes would silently fall out of the pipeline.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)
python
# Preprocessing: median-impute + scale numerics; mode-impute + one-hot categoricals.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])
python
# Hold out 20% for validation, stratified so class proportions are preserved.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(X_train.shape, X_valid.shape)
python
# Fit several baseline models and score each on the validation split.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=2000, class_weight="balanced", random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=500, max_depth=None, min_samples_split=4,
        min_samples_leaf=2, class_weight="balanced",
        random_state=42, n_jobs=-1,
    ),
    "Extra Trees": ExtraTreesClassifier(
        n_estimators=700, max_depth=None, min_samples_split=3,
        min_samples_leaf=1, class_weight="balanced",
        random_state=42, n_jobs=-1,
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42
    ),
}

results = {}
for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)
    score = f1_score(y_valid, pipe.predict(X_valid), average="weighted")
    results[name] = score
    print(f"{name}: Weighted F1 = {score:.5f}")
python
# Pick the model with the highest validation weighted F1.
best_model_name = max(results, key=results.get)
best_score = results[best_model_name]
print("Best model:", best_model_name)
print("Best validation weighted F1:", best_score)
python
# Refit the winning model and inspect its validation metrics in detail.
best_model = models[best_model_name]
final_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", best_model),
])
final_pipeline.fit(X_train, y_train)

valid_preds = final_pipeline.predict(X_valid)
print("Validation Weighted F1:", f1_score(y_valid, valid_preds, average="weighted"))
print("\nClassification Report:")
print(classification_report(y_valid, valid_preds))

print("\nConfusion Matrix:")
labels = sorted(y.unique())
display(pd.DataFrame(
    confusion_matrix(y_valid, valid_preds),
    index=labels,
    columns=labels,
))
python
# 5-fold stratified cross-validation on the full training set as a
# stability check on the single hold-out estimate.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    final_pipeline, X, y, cv=cv, scoring="f1_weighted", n_jobs=-1
)
print("CV weighted F1 scores:", cv_scores)
print("Mean CV weighted F1:", cv_scores.mean())
print("Std CV weighted F1:", cv_scores.std())
python
# Retrain the chosen pipeline on ALL labeled data before predicting the test set.
final_pipeline.fit(X, y)
python
# Show feature importances when the final model exposes them (tree ensembles do;
# e.g. LogisticRegression does not).
model = final_pipeline.named_steps["model"]
try:
    feature_names = final_pipeline.named_steps["preprocessor"].get_feature_names_out()
    if hasattr(model, "feature_importances_"):
        importance_df = (
            pd.DataFrame({
                "feature": feature_names,
                "importance": model.feature_importances_,
            })
            .sort_values("importance", ascending=False)
        )
        display(importance_df.head(25))
    else:
        print("This model does not provide feature_importances_.")
except Exception as e:
    # get_feature_names_out may be unavailable on old sklearn versions.
    print("Could not extract feature importances:", e)
python
# Predict the test set and assemble the submission frame.
test_preds = final_pipeline.predict(X_test)
submission_df = pd.DataFrame({
    "id": test_ids,
    "outcome": test_preds,
})
display(submission_df.head())
print(submission_df["outcome"].value_counts())
python
# Write the submission to disk (no index column).
submission_df.to_csv("submissions.csv", index=False)
print("submissions.csv created successfully!")
print(submission_df.shape)
python
# Re-read the saved file and verify the expected submission format.
check = pd.read_csv("submissions.csv")
print(check.head())
print(check.columns.tolist())
print(check.shape)

assert list(check.columns) == ["id", "outcome"]
assert check.shape[0] == test.shape[0]
assert check["outcome"].isin(["low", "medium", "high"]).all()
print("Submission format is correct.")

Domanda

can u optimize this

Pensiero

Improving feature engineering and model optimization

I'm thinking about features like total expected time, pickup ratio, and duration bins, and adding time-based and log1p features. I'll drop overly missing or constant features. For duration binning, I prefer fixed bins (like [-inf,10,20,30,45,60,90,inf]) using pd.cut. The model will use RandomizedSearchCV with ExtraTrees for optimized weighted F1 scoring, considering GradientBoosting for comparison. I'll ensure the code is complete but concise enough for the user to easily paste and run.

Thought for 50s

Risposta

Yes. Replace your previous modeling code with this optimized version. It adds better features and uses hyperparameter tuning for ExtraTreesClassifier, which usually works well for this kind of tabular classification.

python
# Libraries import pandas as pd import numpy as np from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.impute import SimpleImputer from sklearn.metrics import f1_score, classification_report, confusion_matrix from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier pd.set_option("display.max_columns", 100) pd.set_option("display.max_colwidth", 100)
python
# Load data train = pd.read_csv("train.csv") test = pd.read_csv("test.csv") print(train.shape) print(test.shape) display(train.head()) display(test.head())
python
# Optimized feature engineering.
# FIX: this function was split across two lines in the transcript — the
# trailing "# " comment on the first line swallowed the final drop/return
# statements. Reconstructed as a complete, runnable function.
def create_features(df):
    """Return a copy of *df* with time, duration, bin, and location features.

    Expects columns: request_timestamp, pickup_location_id, drop_location_id,
    expected_time_to_pickup, expected_trip_duration. Non-positive trip
    durations are treated as missing. The raw timestamp column is dropped.
    """
    df = df.copy()
    df["request_timestamp"] = pd.to_datetime(df["request_timestamp"], errors="coerce")

    # Time features
    df["hour"] = df["request_timestamp"].dt.hour
    df["day"] = df["request_timestamp"].dt.day
    df["dayofweek"] = df["request_timestamp"].dt.dayofweek
    df["month"] = df["request_timestamp"].dt.month
    df["minute"] = df["request_timestamp"].dt.minute
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
    df["is_peak_hour"] = df["hour"].isin([7, 8, 9, 17, 18, 19, 20]).astype(int)

    # Cyclic time features (keep 23h adjacent to 0h, Sunday adjacent to Monday)
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    df["dow_sin"] = np.sin(2 * np.pi * df["dayofweek"] / 7)
    df["dow_cos"] = np.cos(2 * np.pi * df["dayofweek"] / 7)

    # Impossible (non-positive) durations become missing
    df.loc[df["expected_trip_duration"] <= 0, "expected_trip_duration"] = np.nan

    # Duration features; +1 in denominators avoids division by zero
    df["total_expected_time"] = df["expected_time_to_pickup"] + df["expected_trip_duration"]
    df["pickup_trip_ratio"] = df["expected_time_to_pickup"] / (df["expected_trip_duration"] + 1)
    df["trip_pickup_ratio"] = df["expected_trip_duration"] / (df["expected_time_to_pickup"] + 1)
    df["log_expected_trip_duration"] = np.log1p(df["expected_trip_duration"])
    df["log_expected_time_to_pickup"] = np.log1p(df["expected_time_to_pickup"])
    df["log_total_expected_time"] = np.log1p(df["total_expected_time"])

    # Binned duration features (integer bin index, right-inclusive edges)
    df["trip_duration_bin"] = pd.cut(
        df["expected_trip_duration"],
        bins=[-np.inf, 10, 20, 30, 45, 60, 90, np.inf],
        labels=False,
    )
    df["pickup_time_bin"] = pd.cut(
        df["expected_time_to_pickup"],
        bins=[-np.inf, 3, 6, 10, 15, 20, np.inf],
        labels=False,
    )

    # Location features
    df["pickup_location_id"] = df["pickup_location_id"].astype(str)
    df["drop_location_id"] = df["drop_location_id"].astype(str)
    df["route"] = df["pickup_location_id"] + "_" + df["drop_location_id"]
    df["same_pickup_drop"] = (df["pickup_location_id"] == df["drop_location_id"]).astype(int)

    # Drop raw timestamp — already decomposed into features above
    df = df.drop(columns=["request_timestamp"], errors="ignore")
    return df
python
# Split out the target and ids, then build the engineered feature matrices.
y = train["outcome"]
test_ids = test["id"]

X = train.drop(columns=["outcome", "id"])
X_test = test.drop(columns=["id"])

X = create_features(X)
X_test = create_features(X_test)
display(X.head())
python
# Drop columns with more than 50% missing values missing_ratio = X.isna().mean() drop_missing_cols = missing_ratio[missing_ratio > 0.50].index.tolist() print("Dropped columns with >50% missing:", drop_missing_cols) X = X.drop(columns=drop_missing_cols, errors="ignore") X_test = X_test.drop(columns=drop_missing_cols, errors="ignore")
python
# Drop constant columns constant_cols = [col for col in X.columns if X[col].nunique(dropna=False) <= 1] print("Dropped constant columns:", constant_cols) X = X.drop(columns=constant_cols, errors="ignore") X_test = X_test.drop(columns=constant_cols, errors="ignore")
python
# Identify numeric and categorical columns numeric_features = X.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns.tolist() categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist() print("Numeric features:", numeric_features) print("Categorical features:", categorical_features)
python
# OneHotEncoder's dense-output flag was renamed across sklearn versions
# (sparse_output in >=1.2, sparse before); fall back gracefully.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", min_frequency=5, sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", ohe),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])
python
# Train-validation split X_train, X_valid, y_train, y_valid = train_test_split( X, y, test_size=0.20, random_state=42, stratify=y ) print(X_train.shape, X_valid.shape)
python
# Randomized hyperparameter search over an Extra Trees pipeline,
# scored with weighted F1 under 4-fold stratified CV.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", ExtraTreesClassifier(random_state=42, n_jobs=-1)),
])

param_grid = {
    "model__n_estimators": [300, 500, 700],
    "model__max_depth": [None, 8, 12, 18, 25, 35],
    "model__min_samples_split": [2, 3, 5, 8, 10],
    "model__min_samples_leaf": [1, 2, 3, 4],
    "model__max_features": ["sqrt", "log2", 0.5, None],
    "model__class_weight": [None, "balanced", "balanced_subsample"],
}

cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=25,
    scoring="f1_weighted",
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train)

print("Best CV score:", search.best_score_)
print("Best parameters:")
print(search.best_params_)
python
# Validation performance best_model = search.best_estimator_ valid_preds = best_model.predict(X_valid) print("Validation Weighted F1:", f1_score(y_valid, valid_preds, average="weighted")) print("\nClassification Report:") print(classification_report(y_valid, valid_preds)) print("\nConfusion Matrix:") display(pd.DataFrame( confusion_matrix(y_valid, valid_preds), index=sorted(y.unique()), columns=sorted(y.unique()) ))
python
# Quick sanity comparison against a fixed-hyperparameter Random Forest;
# keep whichever scores higher on the validation split.
rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=600, max_depth=None, min_samples_split=4,
        min_samples_leaf=2, max_features="sqrt",
        class_weight="balanced_subsample", random_state=42, n_jobs=-1,
    )),
])
rf_pipeline.fit(X_train, y_train)

rf_score = f1_score(y_valid, rf_pipeline.predict(X_valid), average="weighted")
et_score = f1_score(y_valid, valid_preds, average="weighted")
print("Extra Trees Weighted F1:", et_score)
print("Random Forest Weighted F1:", rf_score)

if rf_score > et_score:
    final_model = rf_pipeline
    print("Using Random Forest as final model.")
else:
    final_model = best_model
    print("Using Extra Trees as final model.")
python
# Train final selected model on full training data final_model.fit(X, y)
python
# Predict test data test_preds = final_model.predict(X_test) submission_df = pd.DataFrame({ "id": test_ids, "outcome": test_preds }) display(submission_df.head()) print(submission_df["outcome"].value_counts())
python
# Save submission submission_df.to_csv("submissions.csv", index=False) print("submissions.csv created successfully!") print(submission_df.shape)
python
# Final submission check check = pd.read_csv("submissions.csv") print(check.head()) print(check.columns.tolist()) print(check.shape) assert list(check.columns) == ["id", "outcome"] assert check.shape[0] == test.shape[0] assert check["outcome"].isin(["low", "medium", "high"]).all() print("Submission format is correct.")

For a faster run, reduce this line:

python
n_iter=25

to:

python
n_iter=10

Domanda

more optimistic solution give me

Pensiero

Providing solution for scoring optimization

It seems the user wants an optimistic solution, possibly with the best score. I could offer an advanced solution using XGBoost, but some packages may not be installed. I'll consider using alternatives like sklearn's HistGradientBoosting, targeting robust performance, maybe combining models with ensemble methods, too. I’ll suggest an adaptable approach using target encoding and cross-validation.

Thought for 26s

Risposta

Use this more aggressive / high-score attempt. It uses stronger feature engineering + a soft-voting ensemble. Paste it as your notebook solution.

python
# Libraries import pandas as pd import numpy as np from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.impute import SimpleImputer from sklearn.metrics import f1_score, classification_report, confusion_matrix from sklearn.ensemble import ( ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, VotingClassifier ) from sklearn.linear_model import LogisticRegression pd.set_option("display.max_columns", 100)
python
# Load data train = pd.read_csv("train.csv") test = pd.read_csv("test.csv") print("Train shape:", train.shape) print("Test shape:", test.shape) display(train.head()) display(test.head())
python
# Build frequency/count lookup maps on train+test combined so that
# make_features can encode location and route popularity consistently.
combined = pd.concat(
    [train.drop(columns=["outcome"], errors="ignore"), test],
    axis=0,
).copy()

combined["pickup_location_id"] = combined["pickup_location_id"].astype(str)
combined["drop_location_id"] = combined["drop_location_id"].astype(str)
combined["route"] = combined["pickup_location_id"] + "_" + combined["drop_location_id"]

pickup_freq = combined["pickup_location_id"].value_counts(normalize=True).to_dict()
drop_freq = combined["drop_location_id"].value_counts(normalize=True).to_dict()
route_freq = combined["route"].value_counts(normalize=True).to_dict()

pickup_count = combined["pickup_location_id"].value_counts().to_dict()
drop_count = combined["drop_location_id"].value_counts().to_dict()
route_count = combined["route"].value_counts().to_dict()
python
# FIX: this function was broken across three transcript lines, with statements
# split mid-expression (e.g. `df["log_total_time"] =` on one line and
# `np.log1p(...)` on the next). Reconstructed as one complete function.
def make_features(df):
    """Aggressive feature engineering for the ride-outcome model.

    Relies on module-level frequency/count maps (pickup_freq, drop_freq,
    route_freq, pickup_count, drop_count, route_count) computed on the
    combined train+test data in the previous cell.
    """
    df = df.copy()

    # Timestamp features
    df["request_timestamp"] = pd.to_datetime(df["request_timestamp"], errors="coerce")
    df["hour"] = df["request_timestamp"].dt.hour
    df["minute"] = df["request_timestamp"].dt.minute
    df["day"] = df["request_timestamp"].dt.day
    df["dayofweek"] = df["request_timestamp"].dt.dayofweek
    df["month"] = df["request_timestamp"].dt.month
    df["weekofyear"] = df["request_timestamp"].dt.isocalendar().week.astype(float)
    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(int)
    df["is_morning_peak"] = df["hour"].isin([7, 8, 9]).astype(int)
    df["is_evening_peak"] = df["hour"].isin([17, 18, 19, 20]).astype(int)
    df["is_peak_hour"] = ((df["is_morning_peak"] == 1) | (df["is_evening_peak"] == 1)).astype(int)
    df["is_night"] = df["hour"].isin([0, 1, 2, 3, 4, 5]).astype(int)

    # Cyclic time features
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    df["minute_sin"] = np.sin(2 * np.pi * df["minute"] / 60)
    df["minute_cos"] = np.cos(2 * np.pi * df["minute"] / 60)
    df["dow_sin"] = np.sin(2 * np.pi * df["dayofweek"] / 7)
    df["dow_cos"] = np.cos(2 * np.pi * df["dayofweek"] / 7)

    # Clean impossible duration values (become NaN for the imputer)
    df.loc[df["expected_trip_duration"] <= 0, "expected_trip_duration"] = np.nan
    df.loc[df["expected_time_to_pickup"] < 0, "expected_time_to_pickup"] = np.nan

    # Core duration features
    df["total_expected_time"] = df["expected_time_to_pickup"] + df["expected_trip_duration"]
    df["trip_minus_pickup"] = df["expected_trip_duration"] - df["expected_time_to_pickup"]
    df["trip_plus_pickup"] = df["expected_trip_duration"] + df["expected_time_to_pickup"]
    df["pickup_trip_ratio"] = df["expected_time_to_pickup"] / (df["expected_trip_duration"] + 1)
    df["trip_pickup_ratio"] = df["expected_trip_duration"] / (df["expected_time_to_pickup"] + 1)
    df["log_trip_duration"] = np.log1p(df["expected_trip_duration"])
    df["log_pickup_time"] = np.log1p(df["expected_time_to_pickup"])
    df["log_total_time"] = np.log1p(df["total_expected_time"])
    df["duration_squared"] = df["expected_trip_duration"] ** 2
    df["pickup_squared"] = df["expected_time_to_pickup"] ** 2
    df["duration_pickup_interaction"] = df["expected_trip_duration"] * df["expected_time_to_pickup"]

    # Binned features (string labels so they one-hot encode)
    df["duration_bin"] = pd.cut(
        df["expected_trip_duration"],
        bins=[-np.inf, 10, 20, 30, 40, 50, 60, 80, np.inf],
        labels=["very_short", "short", "medium_short", "medium",
                "medium_long", "long", "very_long", "extreme"],
    ).astype(str)
    df["pickup_bin"] = pd.cut(
        df["expected_time_to_pickup"],
        bins=[-np.inf, 3, 6, 10, 15, 20, np.inf],
        labels=["pickup_very_fast", "pickup_fast", "pickup_normal",
                "pickup_slow", "pickup_very_slow", "pickup_extreme"],
    ).astype(str)
    df["total_time_bin"] = pd.cut(
        df["total_expected_time"],
        bins=[-np.inf, 20, 35, 50, 70, 90, np.inf],
        labels=["total_very_low", "total_low", "total_medium",
                "total_high", "total_very_high", "total_extreme"],
    ).astype(str)

    # Location features
    df["pickup_location_id"] = df["pickup_location_id"].astype(str)
    df["drop_location_id"] = df["drop_location_id"].astype(str)
    df["route"] = df["pickup_location_id"] + "_" + df["drop_location_id"]
    df["same_pickup_drop"] = (df["pickup_location_id"] == df["drop_location_id"]).astype(int)

    # Numeric location relationship — assumes ids parse as numbers; non-numeric
    # ids become NaN here (TODO confirm against the real data).
    df["pickup_location_num"] = pd.to_numeric(df["pickup_location_id"], errors="coerce")
    df["drop_location_num"] = pd.to_numeric(df["drop_location_id"], errors="coerce")
    df["location_diff"] = abs(df["pickup_location_num"] - df["drop_location_num"])
    df["location_sum"] = df["pickup_location_num"] + df["drop_location_num"]
    df["location_product"] = df["pickup_location_num"] * df["drop_location_num"]

    # Frequency encoding from the precomputed train+test maps
    df["pickup_freq"] = df["pickup_location_id"].map(pickup_freq).fillna(0)
    df["drop_freq"] = df["drop_location_id"].map(drop_freq).fillna(0)
    df["route_freq"] = df["route"].map(route_freq).fillna(0)
    df["pickup_count"] = df["pickup_location_id"].map(pickup_count).fillna(0)
    df["drop_count"] = df["drop_location_id"].map(drop_count).fillna(0)
    df["route_count"] = df["route"].map(route_count).fillna(0)

    # Peak-time interactions
    df["peak_trip_duration"] = df["is_peak_hour"] * df["expected_trip_duration"]
    df["weekend_trip_duration"] = df["is_weekend"] * df["expected_trip_duration"]
    df["night_trip_duration"] = df["is_night"] * df["expected_trip_duration"]

    # Remove raw timestamp
    df = df.drop(columns=["request_timestamp"], errors="ignore")
    return df
python
# Prepare train and test y = train["outcome"] test_ids = test["id"] X = train.drop(columns=["outcome", "id"]) X_test = test.drop(columns=["id"]) X = make_features(X) X_test = make_features(X_test) display(X.head()) print(X.shape, X_test.shape)
python
# Drop columns with too many missing values missing_ratio = X.isna().mean() drop_missing_cols = missing_ratio[missing_ratio > 0.50].index.tolist() print("Dropped missing columns:", drop_missing_cols) X = X.drop(columns=drop_missing_cols, errors="ignore") X_test = X_test.drop(columns=drop_missing_cols, errors="ignore")
python
# Drop constant columns constant_cols = [col for col in X.columns if X[col].nunique(dropna=False) <= 1] print("Dropped constant columns:", constant_cols) X = X.drop(columns=constant_cols, errors="ignore") X_test = X_test.drop(columns=constant_cols, errors="ignore")
python
# Column types numeric_features = X.select_dtypes(include=["int64", "float64", "int32", "float32"]).columns.tolist() categorical_features = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist() print("Numeric columns:", len(numeric_features)) print("Categorical columns:", len(categorical_features)) print(categorical_features)
python
# Build a OneHotEncoder across sklearn versions: sparse_output (>=1.2),
# sparse (<1.2), and finally drop min_frequency for very old releases.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", min_frequency=3, sparse_output=False)
except TypeError:
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", min_frequency=3, sparse=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", ohe),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)
python
# Split for local validation X_train, X_valid, y_train, y_valid = train_test_split( X, y, test_size=0.20, random_state=42, stratify=y ) print(X_train.shape, X_valid.shape)
python
# Strong individual models that will feed the voting ensemble below.
extra_trees = ExtraTreesClassifier(
    n_estimators=900, max_depth=None, min_samples_split=2,
    min_samples_leaf=1, max_features="sqrt", class_weight="balanced",
    random_state=42, n_jobs=-1,
)
random_forest = RandomForestClassifier(
    n_estimators=700, max_depth=None, min_samples_split=3,
    min_samples_leaf=1, max_features="sqrt",
    class_weight="balanced_subsample", random_state=42, n_jobs=-1,
)
hist_gb = HistGradientBoostingClassifier(
    max_iter=500, learning_rate=0.035, max_leaf_nodes=31,
    l2_regularization=0.01, random_state=42,
)
grad_boost = GradientBoostingClassifier(
    n_estimators=350, learning_rate=0.045, max_depth=3, random_state=42,
)
log_reg = LogisticRegression(
    max_iter=3000, class_weight="balanced", C=1.5, random_state=42,
)
python
# Soft-voting ensemble: average predicted probabilities, with heavier
# weights on the tree ensembles than on the linear model.
voting_model = VotingClassifier(
    estimators=[
        ("extra_trees", extra_trees),
        ("random_forest", random_forest),
        ("hist_gb", hist_gb),
        ("grad_boost", grad_boost),
        ("log_reg", log_reg),
    ],
    voting="soft",
    weights=[4, 3, 3, 2, 1],
    n_jobs=-1,
)

final_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", voting_model),
])
python
# Train and validate final_pipeline.fit(X_train, y_train) valid_preds = final_pipeline.predict(X_valid) valid_score = f1_score(y_valid, valid_preds, average="weighted") print("Validation Weighted F1:", valid_score) print() print(classification_report(y_valid, valid_preds)) display(pd.DataFrame( confusion_matrix(y_valid, valid_preds), index=sorted(y.unique()), columns=sorted(y.unique()) ))
python
# Cross-validation check cv = StratifiedKFold( n_splits=5, shuffle=True, random_state=42 ) cv_scores = cross_val_score( final_pipeline, X, y, cv=cv, scoring="f1_weighted", n_jobs=-1 ) print("CV scores:", cv_scores) print("Mean CV weighted F1:", cv_scores.mean()) print("Std CV weighted F1:", cv_scores.std())
python
# Train final model on all training data final_pipeline.fit(X, y)
python
# Predict test data test_preds = final_pipeline.predict(X_test) submission_df = pd.DataFrame({ "id": test_ids, "outcome": test_preds }) display(submission_df.head()) print(submission_df["outcome"].value_counts())
python
# Save final submission submission_df.to_csv("submissions.csv", index=False) print("submissions.csv created successfully") print(submission_df.shape)
python
# Final check check = pd.read_csv("submissions.csv") print(check.head()) print(check.columns.tolist()) print(check.shape) assert list(check.columns) == ["id", "outcome"] assert check.shape[0] == test.shape[0] assert check["outcome"].isin(["low", "medium", "high"]).all() print("Submission format is correct.")

If this runs too slowly, change these values:

python
extra_trees:   n_estimators=500
random_forest: n_estimators=400
hist_gb:       max_iter=300
grad_boost:    n_estimators=250

This version should usually perform better than the previous one because it captures duration patterns, route patterns, pickup/drop popularity, peak-hour effects, and then blends multiple models instead of trusting only one.

Condividi questa Q&A