Question

[screenshots of the problem statement — images not preserved in this log]

give me jupyter Notebook code


Thinking

Generating Jupyter code based on user’s problem statement

The user asked for Jupyter notebook code, likely for regression to predict value_19 from sensor data. Based on the given problem statement in the screenshots, I’ll create a notebook that reads train.csv and test.csv, preprocesses time, drops ID, splits the data, and uses models like HistGradientBoostingRegressor or ExtraTreesRegressor, avoiding XGBoost or CatBoost due to potential package limitations in the environment.


Answer

Use this notebook code as-is.

```python
# ==============================
# Cell 1: Imports
# ==============================
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score
```
```python
# ==============================
# Cell 2: Load data
# ==============================
train_path = "train.csv"
test_path = "test.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("\nTrain columns:")
print(train_df.columns.tolist())
```
```python
# ==============================
# Cell 3: Basic preprocessing
# ==============================
TARGET = "value_19"

def convert_time_to_seconds(df, col_name="time"):
    df = df.copy()
    if col_name in df.columns:
        # Convert HH:MM:SS -> total seconds
        df[col_name] = pd.to_timedelta(df[col_name], errors="coerce").dt.total_seconds()
    return df

train_df = convert_time_to_seconds(train_df, "time")
test_df = convert_time_to_seconds(test_df, "time")

# Keep original ids for output
test_ids = test_df["id"].copy()

# Drop id from features
X = train_df.drop(columns=[TARGET, "id"], errors="ignore")
y = train_df[TARGET]
X_test = test_df.drop(columns=["id"], errors="ignore")

print("Training features shape:", X.shape)
print("Target shape:", y.shape)
print("Test features shape:", X_test.shape)
```
```python
# ==============================
# Cell 4: Train multiple models and compare with CV
# ==============================
models = {
    "ExtraTrees": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model", ExtraTreesRegressor(
            n_estimators=500,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42,
            n_jobs=-1
        ))
    ]),
    "RandomForest": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model", RandomForestRegressor(
            n_estimators=400,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42,
            n_jobs=-1
        ))
    ]),
    "GradientBoosting": Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model", GradientBoostingRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=4,
            random_state=42
        ))
    ]),
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=kf, scoring="r2", n_jobs=-1)
    cv_scores[name] = scores.mean()
    print(f"{name}: Mean R2 = {scores.mean():.5f} | Fold scores = {scores}")
```
```python
# ==============================
# Cell 5: Fit best model
# ==============================
best_model_name = max(cv_scores, key=cv_scores.get)
best_model = models[best_model_name]

print("Best model:", best_model_name)
print("Best CV R2:", cv_scores[best_model_name])

best_model.fit(X, y)
```
```python
# ==============================
# Cell 6: Predict on test data
# ==============================
test_preds = best_model.predict(X_test)

submission = pd.DataFrame({
    "id": test_ids,
    "value_19": test_preds
})

submission.head()
```
```python
# ==============================
# Cell 7: Save output file
# ==============================
submission.to_csv("output.csv", index=False)
print("output.csv saved successfully")
```
```python
# ==============================
# Cell 8: Optional check
# ==============================
print(submission.shape)
print(submission.head(10))
```

A slightly stronger variant is the following ensemble. Replace Cell 5 and Cell 6 with these two cells if you want better accuracy:

```python
# ==============================
# Better Cell 5: Fit all models
# ==============================
for name, model in models.items():
    model.fit(X, y)

print("All models trained.")
```
```python
# ==============================
# Better Cell 6: Ensemble prediction
# ==============================
pred_et = models["ExtraTrees"].predict(X_test)
pred_rf = models["RandomForest"].predict(X_test)
pred_gb = models["GradientBoosting"].predict(X_test)

# Weighted average
test_preds = (0.5 * pred_et) + (0.3 * pred_rf) + (0.2 * pred_gb)

submission = pd.DataFrame({
    "id": test_ids,
    "value_19": test_preds
})

submission.head()
```

If the platform rejects `id`, change only this line:

```python
"id": test_ids,
```

to:

```python
"ID": test_ids,
```

because your screenshots show a small mismatch between the sample output and the notes about the ID column name.
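Rather than editing the string literal, the same fix can also be applied after the DataFrame is built via `DataFrame.rename` (the toy `submission` below is a stand-in; the expected column names ultimately come from the screenshots, which this log does not preserve):

```python
import pandas as pd

# Stand-in submission frame; the notebook builds the real one in Cell 6
submission = pd.DataFrame({"id": [1, 2, 3], "value_19": [0.1, 0.2, 0.3]})

# Rename only the id column; other columns are left untouched
submission = submission.rename(columns={"id": "ID"})
print(submission.columns.tolist())  # ['ID', 'value_19']
```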

If you want, I can also format this into a cleaner full notebook with markdown headings.
