give me jupyter notebook code for this given quest...
Создано: 30 марта 2026 г.
Отвечено с использованием GPT-5.4 Thinking от Chat01
Создано: 30 марта 2026 г.
Отвечено с использованием GPT-5.4 Thinking от Chat01
Give me Jupyter notebook code for this given question. It should predict value_19 for the given task with the specified accuracy, writing predictions to my output.csv file. Now give me the final code.
Give me Jupyter notebook code for this given question. It should predict value_19 for the given task with the specified accuracy, writing predictions to my output.csv file. Now give me the final code.
# NOTE(review): chat-transcript artifact, not runnable as-is. Three REPL
# statements ("from PIL import Image", "img = Image.open('/mnt/data/image.png')",
# "img.size") were fused onto one line together with the "python" code-fence
# tag, and the cell output "(864, 1152)" was appended to the code text.
pythonfrom PIL import Image img=Image.open('/mnt/data/image.png') img.size (864, 1152)
# NOTE(review): chat-transcript artifact, not runnable as-is. Several REPL
# statements (matplotlib/numpy imports, "img = np.array(img)", "plt.figure(...)",
# "plt.imshow(img)", "plt.axis('off')") were fused onto one line together with
# the "python" code-fence tag during export.
pythonimport matplotlib.pyplot as plt, numpy as np img=np.array(img) plt.figure(figsize=(8,10)) plt.imshow(img) plt.axis('off')
Use this as your final Jupyter notebook code.
It is built to: read train.csv and test.csv, convert the time column into numeric seconds, and write output.csv.
# =========================================================================
# Predict ``value_19`` from train.csv / test.csv and write output.csv.
#
# The script is built to:
#   1. Read train.csv and test.csv.
#   2. Preprocess: convert the HH:MM:SS ``time`` column into numeric
#      seconds, extract a numeric id, and derive epoch and row-wise
#      sensor features.
#   3. Cross-validate five regressors (5-fold, R^2) and refit each on
#      the full training data.
#   4. Blend the models whose CV score is within 0.03 of the best one,
#      weighted by CV score.
#   5. Save predictions to output.csv (columns: ID, value_19) plus a
#      lowercase-id backup file.
# =========================================================================
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd


def time_to_seconds(x):
    """Convert a clock string to total seconds.

    Accepts "H:MM:SS", "MM:SS" or "SS" (e.g. "0:00:45" -> 45.0,
    "6:15:30" -> 22530.0). Numeric inputs are returned unchanged as
    floats; missing or unparseable values become NaN.
    """
    if pd.isna(x):
        return np.nan
    # Already numeric -> nothing to parse.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)
    text = str(x).strip()
    if text == "":
        return np.nan
    try:
        parts = [float(p) for p in text.split(":")]
    except (TypeError, ValueError):  # narrow catch instead of a bare except
        return np.nan
    if len(parts) == 3:
        h, m, s = parts
        return h * 3600 + m * 60 + s
    if len(parts) == 2:
        m, s = parts
        return m * 60 + s
    if len(parts) == 1:
        return parts[0]
    return np.nan


def preprocess(df, is_train=True):
    """Feature-engineer one dataframe.

    Returns ``(processed_df, original_ids)`` where ``original_ids`` is the
    untouched id/ID column (or a 0..n-1 range when neither exists).
    ``is_train`` is kept for interface compatibility; both splits are
    currently transformed identically.
    """
    df = df.copy()

    # Detect the identifier column ("id" preferred over "ID").
    if "id" in df.columns:
        id_col = "id"
    elif "ID" in df.columns:
        id_col = "ID"
    else:
        id_col = None

    # Preserve the original identifiers for the submission file.
    if id_col is not None:
        original_id = df[id_col].copy()
        # The numeric part of the id may carry ordering information.
        df["id_num"] = (
            df[id_col]
            .astype(str)
            .str.extract(r"(\d+)", expand=False)
            .astype(float)
        )
    else:
        original_id = pd.Series(range(len(df)))

    # Parse the HH:MM:SS ``time`` column into numeric features.
    if "time" in df.columns:
        df["time_seconds"] = df["time"].apply(time_to_seconds)
        df["time_hours"] = df["time_seconds"] // 3600
        df["time_minutes"] = (df["time_seconds"] % 3600) // 60
        df["time_only_seconds"] = df["time_seconds"] % 60
        df.drop(columns=["time"], inplace=True)

    # Force every non-id column to numeric (bad values -> NaN, imputed later).
    for col in df.columns:
        if col not in ("id", "ID"):
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Epoch-derived features.
    if "end_epoch" in df.columns and "start_epoch" in df.columns:
        df["epoch_diff"] = df["end_epoch"] - df["start_epoch"]
        df["start_epoch_mod_day"] = df["start_epoch"] % 86400  # seconds per day
        df["end_epoch_mod_day"] = df["end_epoch"] % 86400

    # Row-wise summary statistics over the sensor columns (target excluded).
    sensor_cols = [
        c for c in df.columns if c.startswith("value_") and c != "value_19"
    ]
    if sensor_cols:
        df["sensor_mean"] = df[sensor_cols].mean(axis=1)
        df["sensor_std"] = df[sensor_cols].std(axis=1)
        df["sensor_min"] = df[sensor_cols].min(axis=1)
        df["sensor_max"] = df[sensor_cols].max(axis=1)
        df["sensor_median"] = df[sensor_cols].median(axis=1)

    return df, original_id


def build_models():
    """Return the name -> sklearn Pipeline dictionary used for CV/blending."""
    # sklearn is imported lazily so the parsing helpers above remain usable
    # without it.
    from sklearn.ensemble import (
        ExtraTreesRegressor,
        GradientBoostingRegressor,
        HistGradientBoostingRegressor,
        RandomForestRegressor,
    )
    from sklearn.impute import SimpleImputer
    from sklearn.linear_model import Ridge
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    return {
        "extra_trees": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("model", ExtraTreesRegressor(
                n_estimators=1200,
                max_features="sqrt",
                min_samples_leaf=1,
                random_state=42,
                n_jobs=-1,
            )),
        ]),
        "random_forest": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("model", RandomForestRegressor(
                n_estimators=1000,
                max_features=0.8,
                min_samples_leaf=1,
                random_state=42,
                n_jobs=-1,
            )),
        ]),
        "gradient_boosting": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("model", GradientBoostingRegressor(
                n_estimators=500,
                learning_rate=0.03,
                max_depth=3,
                subsample=0.9,
                random_state=42,
            )),
        ]),
        "hist_gb": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("model", HistGradientBoostingRegressor(
                learning_rate=0.03,
                max_depth=8,
                max_iter=600,
                l2_regularization=0.05,
                random_state=42,
            )),
        ]),
        "ridge": Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            ("model", Ridge(alpha=3.0)),
        ]),
    }


def main():
    """Run the end-to-end train -> cross-validate -> blend -> output.csv flow."""
    from sklearn.model_selection import KFold, cross_val_score

    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")

    print("Train shape:", train.shape)
    print("Test shape :", test.shape)
    print("\nTrain columns:\n", list(train.columns))
    print("\nTest columns:\n", list(test.columns))

    # Clean stray whitespace from headers.
    train.columns = train.columns.str.strip()
    test.columns = test.columns.str.strip()

    train_processed, _ = preprocess(train, is_train=True)
    test_processed, test_ids = preprocess(test, is_train=False)

    target_col = "value_19"
    if target_col not in train_processed.columns:
        raise ValueError("value_19 column not found in train.csv")
    y = train_processed[target_col].copy()

    # Model features: drop the target and the raw ID columns, then keep
    # only the columns present in both splits (aligned, sorted).
    X = train_processed.drop(columns=[target_col], errors="ignore")
    X = X.drop(columns=["id", "ID"], errors="ignore")
    X_test = test_processed.drop(columns=["id", "ID"], errors="ignore")
    common_cols = sorted(set(X.columns) & set(X_test.columns))
    X = X[common_cols].copy()
    X_test = X_test[common_cols].copy()

    print("\nFinal training features:", X.shape)
    print("Final testing features :", X_test.shape)

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    models = build_models()

    results = []
    test_predictions = {}
    print("\nModel CV scores (R2):\n")
    for name, model in models.items():
        scores = cross_val_score(model, X, y, cv=cv, scoring="r2", n_jobs=-1)
        mean_score = scores.mean()
        print(f"{name:18s} -> Mean R2: {mean_score:.5f} | Std: {scores.std():.5f}")
        # Refit on the full training data and predict the test split.
        model.fit(X, y)
        test_predictions[name] = model.predict(X_test)
        results.append((name, mean_score))

    # Leaderboard: best CV score first.
    results.sort(key=lambda item: item[1], reverse=True)
    print("\nSorted leaderboard:")
    for entry in results:
        print(entry)

    # Blend every model within 0.03 R2 of the best; fall back to the single
    # best model if the filter removes everything.
    best_score = results[0][1]
    selected = [(n, s) for n, s in results if s >= best_score - 0.03]
    if not selected:
        selected = [results[0]]
    print("\nSelected models for blending:")
    for name, score in selected:
        print(f"{name}: {score:.5f}")

    # Convert negative/very small scores into safe positive weights.
    weights = np.array([max(s, 0.0001) for _, s in selected], dtype=float)
    weights /= weights.sum()
    final_pred = np.zeros(len(X_test))
    for (name, _), w in zip(selected, weights):
        final_pred += w * test_predictions[name]
    # Optional clipping if the target is sensor-like and extreme noise appears:
    # final_pred = np.clip(final_pred, y.min(), y.max())

    # Screenshot note: output columns should be exactly ID and value_19.
    submission = pd.DataFrame({"ID": test_ids, "value_19": final_pred})
    submission.to_csv("output.csv", index=False)
    # Backup file in case the platform actually expects a lowercase id header.
    submission.rename(columns={"ID": "id"}).to_csv(
        "output_lowercase_id.csv", index=False
    )

    print("\nDone.")
    print("Saved: output.csv")
    print("Also saved backup: output_lowercase_id.csv")
    print("\nPreview:")
    print(submission.head())


if __name__ == "__main__":
    main()
A couple of important points from your screenshots:
It predicts value_19, converts time from HH:MM:SS to numeric, and writes output.csv. It uses ID as the header in output.csv because your screenshot note says the exact columns should be ID and value_19, and it also saves output_lowercase_id.csv as a backup because the sample image shows a lowercase id. If your platform rejects output.csv because of the header name, change only this part:
# Alternative submission header: lowercase "id" instead of "ID".
submission = pd.DataFrame(
    {
        "id": test_ids,
        "value_19": final_pred,
    }
)
and save again.