give me jupyter Notebook code
생성일: 2026년 3월 30일
답변에 사용된 모델 GPT-5.4 Thinking by Chat01
생성일: 2026년 3월 30일
답변에 사용된 모델 GPT-5.4 Thinking by Chat01
give me jupyter Notebook code
give me jupyter Notebook code
Use this notebook code as-is.
# ==============================
# Cell 1: Imports
# ==============================
# Install the blanket warning filter first so it is already active while the
# libraries below are imported and used.
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
# ==============================
# Cell 2: Load data
# ==============================
# Both files are addressed by relative path, i.e. resolved against the
# current working directory.
train_path = "train.csv"
test_path = "test.csv"

# Read the training file first, then the test file.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report the dimensions of each frame, then the training column names.
for label, frame in (("Train shape:", train_df), ("Test shape :", test_df)):
    print(label, frame.shape)

print("\nTrain columns:")
print(train_df.columns.tolist())
# ==============================
# Cell 3: Basic preprocessing
# ==============================
TARGET = "value_19"


def convert_time_to_seconds(df, col_name="time"):
    """Return a copy of ``df`` with ``col_name`` expressed as total seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is never modified in place.
    col_name : str, default "time"
        Column holding durations such as ``"HH:MM:SS"``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``. When ``col_name`` is absent, or is already numeric,
        the copy is returned unchanged. Otherwise the column is parsed with
        ``pd.to_timedelta`` and replaced by float seconds; values that cannot
        be parsed become ``NaN``.
    """
    df = df.copy()
    if col_name not in df.columns:
        return df

    # pd.to_timedelta treats bare numbers as nanoseconds, so running it on a
    # column that is already numeric would silently shrink every value by 1e9.
    # NOTE(review): a numeric column is assumed to already be in seconds.
    if pd.api.types.is_numeric_dtype(df[col_name]):
        return df

    # Convert HH:MM:SS -> total seconds
    df[col_name] = pd.to_timedelta(df[col_name], errors="coerce").dt.total_seconds()
    return df


train_df = convert_time_to_seconds(train_df, "time")
test_df = convert_time_to_seconds(test_df, "time")

# Keep original ids for output
test_ids = test_df["id"].copy()

# Drop the target and the id from the training features
X = train_df.drop(columns=[TARGET, "id"], errors="ignore")
y = train_df[TARGET]

# Select the test features by the training column list so both frames have
# the same columns in the same order. This also discards "id" and any other
# column that is not a training feature, and raises KeyError right here if
# the test file lacks a feature the models will be trained on.
X_test = test_df[X.columns]

print("Training features shape:", X.shape)
print("Target shape:", y.shape)
print("Test features shape:", X_test.shape)
# ==============================
# Cell 4: Train multiple models and compare with CV
# ==============================
def _median_imputed(regressor):
    """Wrap ``regressor`` in a pipeline that median-imputes missing values first."""
    return Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("model", regressor),
    ])


# Settings shared by the two forest-style regressors.
_forest_settings = dict(
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
)

models = {
    "ExtraTrees": _median_imputed(
        ExtraTreesRegressor(n_estimators=500, **_forest_settings)
    ),
    "RandomForest": _median_imputed(
        RandomForestRegressor(n_estimators=400, **_forest_settings)
    ),
    "GradientBoosting": _median_imputed(
        GradientBoostingRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=4,
            random_state=42,
        )
    ),
}

# Shuffled 5-fold split with a fixed seed, reused for every candidate.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Mean R2 per model name, filled in as each candidate is evaluated.
cv_scores = {}
for model_name, pipeline in models.items():
    fold_r2 = cross_val_score(pipeline, X, y, cv=kf, scoring="r2", n_jobs=-1)
    mean_r2 = fold_r2.mean()
    cv_scores[model_name] = mean_r2
    print(f"{model_name}: Mean R2 = {mean_r2:.5f} | Fold scores = {fold_r2}")
# ==============================
# Cell 5: Fit best model
# ==============================
# Pick the (name, score) pair with the highest mean CV R2; on a tie the
# entry that appears first in cv_scores wins.
best_model_name, best_cv_r2 = max(cv_scores.items(), key=lambda entry: entry[1])
best_model = models[best_model_name]

print("Best model:", best_model_name)
print("Best CV R2:", best_cv_r2)

# Refit the chosen pipeline on the full training data.
best_model.fit(X, y)
# ==============================
# Cell 6: Predict on test data
# ==============================
test_preds = best_model.predict(X_test)

# Start from the saved ids, then attach the predictions as a second column.
submission = pd.DataFrame({"id": test_ids})
submission["value_19"] = test_preds

# Last expression of the cell: shows the first rows in the notebook output.
submission.head()
# ==============================
# Cell 7: Save output file
# ==============================
# Write the submission without the DataFrame index column.
output_file = "output.csv"
submission.to_csv(output_file, index=False)

print("output.csv saved successfully")
# ==============================
# Cell 8: Optional check
# ==============================
# Print the submission's dimensions, then its first ten rows.
for preview in (submission.shape, submission.head(10)):
    print(preview)
An optional variant is to ensemble all three models instead of using only the best one. Replace Cell 5 and Cell 6 with the two cells below; a weighted average often, but not always, improves accuracy:
# ==============================
# Better Cell 5: Fit all models
# ==============================
# Fit every pipeline in the models dict on the full training data.
for pipeline in models.values():
    pipeline.fit(X, y)

print("All models trained.")
# ==============================
# Better Cell 6: Ensemble prediction
# ==============================
# Predict with each fitted pipeline, in the order ExtraTrees, RandomForest,
# GradientBoosting.
pred_et, pred_rf, pred_gb = (
    models[model_name].predict(X_test)
    for model_name in ("ExtraTrees", "RandomForest", "GradientBoosting")
)

# Weighted average
test_preds = 0.5 * pred_et + 0.3 * pred_rf + 0.2 * pred_gb

# Start from the saved ids, then attach the blended predictions.
submission = pd.DataFrame({"id": test_ids})
submission["value_19"] = test_preds

# Last expression of the cell: shows the first rows in the notebook output.
submission.head()
If the platform rejects the `id` column name in the output file, change only this line:
`"id": test_ids,`
to:
`"ID": test_ids,`
because your screenshot shows a small mismatch between the sample output and the note.
If you want, I can also format this into a cleaner full notebook with markdown headings.