Prediksi untuk hasil selanjutnya menggunakan xgboo...
Creado el: 8 de junio de 2026
Respondido usando GPT-5.5 Thinking por Chat01
Creado el: 8 de junio de 2026
Respondido usando GPT-5.5 Thinking por Chat01
Prediksi untuk hasil selanjutnya menggunakan xgboost dan setelah prediksi buatkan scriptnya untuk google colab
# Cell: load the processed dataset and take a first look at it.
import json
import os
import textwrap

import numpy as np
import pandas as pd

path = '/mnt/data/PROCESSED_DATA_READY_FOR_ML.csv'
df = pd.read_csv(path)

# Notebook-style trailing expression: shape, first rows, last rows, dtypes.
(df.shape, df.head(), df.tail(), df.dtypes)

# Recorded output:
# ((2103, 61),
#               timestamp  raw_number  ...  freq_digit_8_10  freq_digit_9_10
#  0  2026-02-10 21:30:00        6112  ...              0.0              0.0
#  1  2026-02-10 21:30:00         236  ...              0.0              0.0
#  2  2026-02-10 21:30:00        7341  ...              0.0              0.0
#  3  2026-02-11 03:30:00        9876  ...              0.0              0.0
#  4  2026-02-11 03:30:00        2992  ...              0.0              0.0
#  [5 rows x 61 columns],
#                  timestamp  raw_number  ...  freq_digit_8_10  freq_digit_9_10
#  2098  2026-06-08 11:30:00        8949  ...              4.0              6.0
#  2099  2026-06-08 11:30:00         736  ...              4.0              6.0
#  2100  2026-06-08 13:30:00        5676  ...              3.0              5.0
#  2101  2026-06-08 13:30:00        8134  ...              4.0              4.0
#  2102  2026-06-08 13:30:00        4296  ...              4.0              4.0
#  [5 rows x 61 columns],
#  timestamp           object
#  raw_number           int64
#  d1                   int64
#  d2                   int64
#  d3                   int64
#  ...
#  freq_digit_5_10    float64
#  freq_digit_6_10    float64
#  freq_digit_7_10    float64
#  freq_digit_8_10    float64
#  freq_digit_9_10    float64
#  Length: 61, dtype: object)
# Cell: column inventory, summary statistics, and missing-value check.
print(df.columns.tolist())

# One row per column; capped at the first 80 columns.
summary = df.describe(include='all').T
print(summary.head(80).to_string())

# Count NaNs per column once and show only the columns that have any.
na_counts = df.isna().sum()
print(na_counts[na_counts > 0])
# Cell: sanity-check the digit columns and how rows group by timestamp.
view_cols = ['timestamp', 'raw_number', 'd1', 'd2', 'd3', 'd4', 'sum_digits']

# .loc slices by label, so this shows index labels 0 through 10 inclusive.
print(df.loc[:10, view_cols].to_string())

# Rows whose raw_number is below 1000, shown next to their d1..d4 digits.
below_1000 = df['raw_number'] < 1000
print(df.loc[below_1000, ['raw_number', 'd1', 'd2', 'd3', 'd4']].head(10).to_string())

# Rows per timestamp, then how often each group size occurs.
rows_per_timestamp = df.groupby('timestamp').size()
print("duplicates timestamp count groups:", rows_per_timestamp.value_counts().head())

print("last timestamp group")
latest_timestamp = df['timestamp'].iloc[-1]
print(df[df['timestamp'] == latest_timestamp][view_cols])
# Cell: inspect the draw schedule implied by the timestamps.
df['timestamp_dt'] = pd.to_datetime(df['timestamp'])

# Distinct draw times in chronological order.
uniq = df['timestamp_dt'].drop_duplicates().sort_values()

# Most common gaps between consecutive draw times.
gap_counts = uniq.diff().value_counts()
print(gap_counts.head(20))

print(uniq.tail(20).to_string(index=False))

# Distinct times of day (HH:MM) at which draws occur.
times_of_day = df['timestamp_dt'].dt.strftime('%H:%M').unique()
print("unique hours", sorted(times_of_day))

print("last per group")
tail_cols = ['timestamp', 'raw_number', 'd1', 'd2', 'd3', 'd4', 'sum_digits']
print(df.tail(15)[tail_cols].to_string())
# Cell: look at the precomputed lag / rolling columns for the first rows
# (index labels 0 through 5 inclusive, since .loc slices by label).
lag_cols = [
    'raw_number',
    'sum_digits',
    'lag_1_raw',
    'lag_2_raw',
    'lag_1_sum',
    'lag_2_sum',
    'rolling_mean_sum_3',
    'rolling_mean_sum_5',
    'ema_sum_5',
]
print(df.loc[:5, lag_cols].to_string())
# Cell: check whether the xgboost package can be imported in this environment.
import importlib.util
import os
import pkgutil
import subprocess
import sys

import numpy as np
import pandas as pd

# find_spec returns None when the package cannot be located.
xgboost_spec = importlib.util.find_spec("xgboost")
print("xgboost installed?", xgboost_spec is not None)
# Cell: canonical load of the dataset used by the later modelling cells.
import json
import math
import os
import sys
import textwrap
import warnings
from datetime import timedelta

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

path = '/mnt/data/PROCESSED_DATA_READY_FOR_ML.csv'

# The file is read exactly once.  An earlier draft of this cell also loaded and
# sorted it on 'timestamp' alone before reloading; that result was discarded,
# so the redundant pass has been removed.
df = pd.read_csv(path)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Several rows share one timestamp.  Sorting on (timestamp, original row index)
# with a stable algorithm keeps their CSV order deterministic.
df['_orig_idx'] = np.arange(len(df))
df = df.sort_values(['timestamp', '_orig_idx'], kind='mergesort').reset_index(drop=True)

# Position of each row inside its timestamp group: 0, 1, 2, ...
df['draw_pos'] = df.groupby('timestamp').cumcount()

# Build the 4-character string from the digit columns so that leading zeros
# are kept (raw_number 236 becomes '0236').
for c in ['d1', 'd2', 'd3', 'd4']:
    df[c] = df[c].astype(int)
df['raw_str'] = df[['d1', 'd2', 'd3', 'd4']].astype(str).agg(''.join, axis=1)

preview_cols = ['timestamp', 'raw_str', 'draw_pos']
(df[preview_cols].head(), df[preview_cols].tail())

# Recorded output:
# (            timestamp raw_str  draw_pos
#  0 2026-02-10 21:30:00    6112         0
#  1 2026-02-10 21:30:00    0236         1
#  2 2026-02-10 21:30:00    7341         2
#  3 2026-02-11 03:30:00    9876         0
#  4 2026-02-11 03:30:00    2992         1,
#                timestamp raw_str  draw_pos
#  2098 2026-06-08 11:30:00    8949         1
#  2099 2026-06-08 11:30:00    0736         2
#  2100 2026-06-08 13:30:00    5676         0
#  2101 2026-06-08 13:30:00    8134         1
#  2102 2026-06-08 13:30:00    4296         2)
# Cell: feature-engineering helpers and the supervised training matrix.
import json
import math
import os
import textwrap
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")


def next_timestamp_after(ts):
    """Return the first scheduled draw time strictly after ``ts``.

    The schedule (03:30, 06:30, 11:30, 13:30, 17:30, 21:30) is inferred from
    the data.  When ``ts`` is at or past the last slot of its day, the first
    slot of the following day is returned.
    """
    schedule = [(3, 30), (6, 30), (11, 30), (13, 30), (17, 30), (21, 30)]
    date = ts.normalize()
    for h, m in schedule:
        cand = date + pd.Timedelta(hours=h, minutes=m)
        if cand > ts:
            return cand
    first_h, first_m = schedule[0]
    return date + pd.Timedelta(days=1, hours=first_h, minutes=first_m)


def timestamp_features(ts, draw_pos):
    """Return calendar features of ``ts`` plus ``draw_pos`` as a dict.

    Hour, day-of-week, month and day-of-month are included both as raw values
    and as sine/cosine pairs.
    """
    hour_float = ts.hour + ts.minute / 60
    two_pi = 2 * math.pi
    return {
        "draw_pos": draw_pos,
        "hour": ts.hour,
        "minute": ts.minute,
        "dayofweek": ts.dayofweek,
        "day": ts.day,
        "month": ts.month,
        "is_weekend": int(ts.dayofweek >= 5),
        "hour_sin": math.sin(two_pi * hour_float / 24),
        "hour_cos": math.cos(two_pi * hour_float / 24),
        "dow_sin": math.sin(two_pi * ts.dayofweek / 7),
        "dow_cos": math.cos(two_pi * ts.dayofweek / 7),
        "month_sin": math.sin(two_pi * ts.month / 12),
        "month_cos": math.cos(two_pi * ts.month / 12),
        "day_sin": math.sin(two_pi * ts.day / 31),
        "day_cos": math.cos(two_pi * ts.day / 31),
    }


def _digit_shares(values):
    """Return the share of each digit 0-9 in ``values`` (all zeros if empty)."""
    if values.size == 0:
        return np.zeros(10)
    return np.bincount(values, minlength=10) / values.size


def compute_features_from_history(history_digits, history_ts, future_ts, draw_pos, max_lag=10):
    """Build the feature dict for one row from the rows that precede it.

    Parameters
    ----------
    history_digits : list of [d1, d2, d3, d4]
        Results that happened before the row being described, oldest first.
    history_ts : list
        Timestamps of the history rows.  Not used by any feature; kept so
        existing callers keep working.
    future_ts : pandas.Timestamp
        Timestamp of the row being described.
    draw_pos : int
        Position of the row inside its timestamp group.
    max_lag : int, default 10
        Number of lagged results to expose (``lag1`` is the most recent).

    Returns
    -------
    dict
        Feature name -> value.  Missing lags are encoded as -1, empty rolling
        windows as 0.0, and a digit not seen in the last 100 rows gets gap 999.
    """
    feats = timestamp_features(future_ts, draw_pos)

    # No feature looks further back than max(max_lag, 100) rows (the gap scan
    # is capped at 100), so only that tail is converted to an array.  This
    # avoids re-copying the whole history on every call.
    window = max(max_lag, 100)
    hist = np.asarray(history_digits[-window:], dtype=int).reshape(-1, 4)
    n = len(hist)
    sums = hist.sum(axis=1)
    raws = hist.dot(np.array([1000, 100, 10, 1]))

    # Lags: the most recent result is lag 1.
    for lag in range(1, max_lag + 1):
        available = n >= lag
        feats[f"lag{lag}_raw"] = int(raws[-lag]) if available else -1
        feats[f"lag{lag}_sum"] = int(sums[-lag]) if available else -1
        for j in range(4):
            feats[f"lag{lag}_d{j+1}"] = int(hist[-lag, j]) if available else -1

    # Rolling statistics of the digit sum over the previous rows only.
    for w in [3, 5, 10, 20, 50]:
        vals = sums[-w:]
        has_vals = len(vals) > 0
        feats[f"roll{w}_sum_mean"] = float(vals.mean()) if has_vals else 0.0
        feats[f"roll{w}_sum_std"] = float(vals.std(ddof=0)) if has_vals else 0.0
        feats[f"roll{w}_sum_min"] = float(vals.min()) if has_vals else 0.0
        feats[f"roll{w}_sum_max"] = float(vals.max()) if has_vals else 0.0

    # Digit frequency across all four positions in the last w rows.
    for w in [10, 20, 50]:
        shares = _digit_shares(hist[-w:].reshape(-1))
        for d in range(10):
            feats[f"freq{w}_digit{d}"] = shares[d]

    # Position-specific digit frequency over the last 20 rows.
    recent_pos = hist[-20:]
    for pos in range(4):
        shares = _digit_shares(recent_pos[:, pos])
        for d in range(10):
            feats[f"freq20_pos{pos+1}_digit{d}"] = shares[d]

    # Gap: number of rows since the digit last appeared in any position,
    # scanning at most 100 rows back; 999 when it was not seen.
    newest_first = hist[::-1][:100]
    for d in range(10):
        hits = np.flatnonzero((newest_first == d).any(axis=1))
        feats[f"gap_digit{d}"] = int(hits[0]) + 1 if hits.size else 999

    return feats


# Build the training feature matrix.  Features for each row are computed before
# that row is appended to the history, so a row never sees its own result.
history_digits, history_ts = [], []
rows, targets, meta = [], [], []
digit_rows = df[['d1', 'd2', 'd3', 'd4']].astype(int).values.tolist()
for ts, dp, raw_str, digits in zip(df['timestamp'], df['draw_pos'], df['raw_str'], digit_rows):
    dp = int(dp)
    rows.append(compute_features_from_history(history_digits, history_ts, ts, dp, max_lag=10))
    targets.append(digits)
    meta.append((ts, dp, raw_str))
    history_digits.append(digits)
    history_ts.append(ts)

X = pd.DataFrame(rows).fillna(0)
Y = pd.DataFrame(targets, columns=['d1', 'd2', 'd3', 'd4'])

# Drop the first 50 rows: their lag / rolling history is incomplete.
start_idx = 50
X_model = X.iloc[start_idx:].reset_index(drop=True)
Y_model = Y.iloc[start_idx:].reset_index(drop=True)
meta_model = meta[start_idx:]

(len(X_model), X_model.shape, Y_model.shape)

# Recorded output:
# (2053, (2053, 175), (2053, 4))
# Cell: chronological hold-out evaluation, one classifier per digit position.

# The rows are already in time order, so the last 20% form the test set.
split = int(len(X_model) * 0.8)
X_train, X_test = X_model.iloc[:split], X_model.iloc[split:]
Y_train, Y_test = Y_model.iloc[:split], Y_model.iloc[split:]

params = dict(
    objective="multi:softprob",
    num_class=10,
    n_estimators=180,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.85,
    colsample_bytree=0.85,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=2,
    reg_lambda=1.5,
)

models = {}
metrics = {}
probas_test = {}
for col in ['d1', 'd2', 'd3', 'd4']:
    model = XGBClassifier(**params)
    model.fit(X_train, Y_train[col])
    pred = model.predict(X_test)
    proba = model.predict_proba(X_test)

    # predict_proba columns follow model.classes_, so the three highest-scoring
    # column indices are translated to digit labels before comparing with the
    # target.  The forecasting cell maps through classes_ in the same way.
    top3_labels = model.classes_[np.argsort(proba, axis=1)[:, -3:]]
    y_true = Y_test[col].to_numpy()
    top3_hit = (top3_labels == y_true[:, None]).any(axis=1)

    models[col] = model
    probas_test[col] = proba
    metrics[col] = {
        "top1_acc": accuracy_score(Y_test[col], pred),
        "top3_acc": np.mean(top3_hit),
    }

metrics

# Recorded output:
# {'d1': {'top1_acc': 0.10462287104622871, 'top3_acc': np.float64(0.31630170316301703)},
#  'd2': {'top1_acc': 0.06326034063260341, 'top3_acc': np.float64(0.2895377128953771)},
#  'd3': {'top1_acc': 0.07785888077858881, 'top3_acc': np.float64(0.25547445255474455)},
#  'd4': {'top1_acc': 0.1070559610705596, 'top3_acc': np.float64(0.31386861313868614)}}
# Cell: refit on every modelling row, then forecast the three results of the
# next scheduled timestamp.
digit_cols = ['d1', 'd2', 'd3', 'd4']

final_models = {}
for col in digit_cols:
    model = XGBClassifier(**params)
    model.fit(X_model, Y_model[col])
    final_models[col] = model

# Start from the full observed history.  Each forecast is appended to it, so
# the second and third forecasts are conditioned on the earlier predictions.
history_digits = df[digit_cols].astype(int).values.tolist()
history_ts = df['timestamp'].tolist()
last_ts = df['timestamp'].iloc[-1]
next_ts = next_timestamp_after(last_ts)

forecasts = []
for dp in range(3):
    feats = compute_features_from_history(history_digits, history_ts, next_ts, dp, max_lag=10)
    # Align to the training column order; a column missing here is filled with 0.
    X_next = pd.DataFrame([feats]).reindex(columns=X_model.columns, fill_value=0)

    digits = []
    digit_info = {}
    for col in digit_cols:
        clf = final_models[col]
        proba = clf.predict_proba(X_next)[0]

        # Spread the probabilities over all ten digits, indexed by class label.
        full = np.zeros(10)
        for cls, p in zip(clf.classes_, proba):
            full[int(cls)] = p

        # Five most probable digits, best first; the best one is the prediction.
        top = np.argsort(full)[::-1][:5]
        digits.append(int(top[0]))
        digit_info[col] = [(int(d), float(full[d])) for d in top]

    forecasts.append({
        "timestamp": next_ts,
        "draw_pos": dp,
        "predicted": ''.join(str(d) for d in digits),
        "digits": digits,
        "digit_info": digit_info,
    })

    history_digits.append(digits)
    history_ts.append(next_ts)

forecasts

# Recorded output:
# [{'timestamp': Timestamp('2026-06-08 17:30:00'), 'draw_pos': 0, 'predicted': '6326', 'digits': [6, 3, 2, 6],
#   'digit_info': {
#     'd1': [(6, 0.17855720221996307), (5, 0.11627469211816788), (0, 0.11143366247415543), (2, 0.10549677163362503), (7, 0.09652142226696014)],
#     'd2': [(3, 0.15245047211647034), (7, 0.14461606740951538), (0, 0.12710347771644592), (1, 0.12532813847064972), (8, 0.12181589007377625)],
#     'd3': [(2, 0.20306669175624847), (0, 0.19056595861911774), (3, 0.12115106731653214), (9, 0.09936410933732986), (5, 0.08558907359838486)],
#     'd4': [(6, 0.146614208817482), (3, 0.13650761544704437), (8, 0.1351807415485382), (4, 0.12573111057281494), (2, 0.11719737201929092)]}},
#  {'timestamp': Timestamp('2026-06-08 17:30:00'), 'draw_pos': 1, 'predicted': '8333', 'digits': [8, 3, 3, 3],
#   'digit_info': {
#     'd1': [(8, 0.2398628443479538), (4, 0.13556158542633057), (5, 0.12728619575500488), (2, 0.11167753487825394), (7, 0.0827118381857872)],
#     'd2': [(3, 0.1753360778093338), (8, 0.13878747820854187), (1, 0.13187645375728607), (6, 0.10360396653413773), (2, 0.09662382304668427)],
#     'd3': [(3, 0.1557879000902176), (2, 0.13867302238941193), (8, 0.12241671234369278), (9, 0.1097588986158371), (0, 0.10521094501018524)],
#     'd4': [(3, 0.1455286145210266), (9, 0.13309448957443237), (8, 0.12891900539398193), (6, 0.12755057215690613), (4, 0.09771434217691422)]}},
#  {'timestamp': Timestamp('2026-06-08 17:30:00'), 'draw_pos': 2, 'predicted': '7003', 'digits': [7, 0, 0, 3],
#   'digit_info': {
#     'd1': [(7, 0.17086610198020935), (2, 0.1485922932624817), (8, 0.12388257682323456), (0, 0.10835260897874832), (6, 0.0954427421092987)],
#     'd2': [(0, 0.15440449118614197), (1, 0.15199314057826996), (2, 0.13727785646915436), (7, 0.13545693457126617), (3, 0.09616308659315109)],
#     'd3': [(0, 0.16490399837493896), (9, 0.1485580950975418), (2, 0.14743492007255554), (8, 0.11350205540657043), (3, 0.11339398473501205)],
#     'd4': [(3, 0.18991610407829285), (1, 0.13012440502643585), (6, 0.12870371341705322), (0, 0.11157738417387009), (4, 0.08606719970703125)]}}]
# Cell: average the hold-out accuracies over the four digit positions.
top1_scores = [scores['top1_acc'] for scores in metrics.values()]
top3_scores = [scores['top3_acc'] for scores in metrics.values()]

avg_top1 = np.mean(top1_scores)
avg_top3 = np.mean(top3_scores)

(avg_top1, avg_top3)

# Recorded output:
# (np.float64(0.08819951338199514), np.float64(0.2937956204379562))
# Cell: write the Google Colab script (.py), the same script as a notebook
# (.ipynb), and a small CSV holding the predictions from the analysis above.
import json
import textwrap
from pathlib import Path

import pandas as pd

# Source of the Colab script.  It is a raw string so that sequences such as
# "\n" inside the script reach the output file unchanged.
#
# Compared with the first draft of this script:
#   * top-3 accuracy maps predict_proba columns through model.classes_ (the
#     prediction step already did) instead of treating the column index as the
#     digit label;
#   * compute_features_from_history converts only the trailing rows it can
#     actually use instead of the whole history on every call.
colab_code = r'''
# ============================================================
# Prediksi Hasil Selanjutnya dengan XGBoost - Google Colab
# Dataset: PROCESSED_DATA_READY_FOR_ML.csv
# ============================================================

# 1) Install library
!pip -q install xgboost

# 2) Import library
import math
import warnings
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")

# 3) Upload file CSV
# Jalankan cell ini, lalu pilih file: PROCESSED_DATA_READY_FOR_ML.csv
from google.colab import files
uploaded = files.upload()
csv_path = next(iter(uploaded.keys()))
print("File yang dipakai:", csv_path)

# 4) Load data
df = pd.read_csv(csv_path)
df["timestamp"] = pd.to_datetime(df["timestamp"])
df["_orig_idx"] = np.arange(len(df))
df = df.sort_values(["timestamp", "_orig_idx"], kind="mergesort").reset_index(drop=True)

# Pastikan digit menjadi integer
for c in ["d1", "d2", "d3", "d4"]:
    df[c] = df[c].astype(int)

# Buat string angka 4 digit agar angka seperti 0236 tidak hilang nol depannya
df["raw_str"] = df[["d1", "d2", "d3", "d4"]].astype(str).agg("".join, axis=1)

# Karena setiap timestamp berisi 3 hasil, buat posisi hasil dalam timestamp: 0, 1, 2
df["draw_pos"] = df.groupby("timestamp").cumcount()

print("Shape data:", df.shape)
print("Timestamp terakhir:", df["timestamp"].iloc[-1])
print("3 hasil terakhir:")
display(df.tail(3)[["timestamp", "draw_pos", "raw_str", "d1", "d2", "d3", "d4"]])

# 5) Fungsi jadwal berikutnya
def next_timestamp_after(ts):
    """
    Jadwal diinfer dari data:
    03:30, 06:30, 11:30, 13:30, 17:30, 21:30.
    """
    schedule = [(3,30), (6,30), (11,30), (13,30), (17,30), (21,30)]
    date = ts.normalize()
    for h, m in schedule:
        cand = date + pd.Timedelta(hours=h, minutes=m)
        if cand > ts:
            return cand
    return date + pd.Timedelta(days=1, hours=schedule[0][0], minutes=schedule[0][1])

# 6) Feature engineering tanpa data leakage
def timestamp_features(ts, draw_pos):
    hour_float = ts.hour + ts.minute / 60
    return {
        "draw_pos": draw_pos,
        "hour": ts.hour,
        "minute": ts.minute,
        "dayofweek": ts.dayofweek,
        "day": ts.day,
        "month": ts.month,
        "is_weekend": int(ts.dayofweek >= 5),
        "hour_sin": math.sin(2 * math.pi * hour_float / 24),
        "hour_cos": math.cos(2 * math.pi * hour_float / 24),
        "dow_sin": math.sin(2 * math.pi * ts.dayofweek / 7),
        "dow_cos": math.cos(2 * math.pi * ts.dayofweek / 7),
        "month_sin": math.sin(2 * math.pi * ts.month / 12),
        "month_cos": math.cos(2 * math.pi * ts.month / 12),
        "day_sin": math.sin(2 * math.pi * ts.day / 31),
        "day_cos": math.cos(2 * math.pi * ts.day / 31),
    }

def compute_features_from_history(history_digits, future_ts, draw_pos, max_lag=10):
    """
    history_digits berisi list digit [d1,d2,d3,d4] yang sudah terjadi SEBELUM baris yang diprediksi.
    Jadi fitur tidak memakai hasil saat ini.
    """
    feats = timestamp_features(future_ts, draw_pos)

    # Tidak ada fitur yang melihat lebih dari max(max_lag, 100) baris ke belakang,
    # jadi cukup ubah ekor histori itu saja menjadi array (hasil fitur tetap sama).
    tail = history_digits[-max(max_lag, 100):]
    n = len(tail)
    hist = np.array(tail, dtype=int) if n else np.empty((0, 4), dtype=int)
    sums = hist.sum(axis=1) if n else np.array([])
    raws = hist.dot(np.array([1000, 100, 10, 1])) if n else np.array([])

    # Lag 1 sampai 10
    for lag in range(1, max_lag + 1):
        if n >= lag:
            digs = hist[-lag]
            feats[f"lag{lag}_raw"] = int(raws[-lag])
            feats[f"lag{lag}_sum"] = int(sums[-lag])
            for j in range(4):
                feats[f"lag{lag}_d{j+1}"] = int(digs[j])
        else:
            feats[f"lag{lag}_raw"] = -1
            feats[f"lag{lag}_sum"] = -1
            for j in range(4):
                feats[f"lag{lag}_d{j+1}"] = -1

    # Rolling statistik dari histori sebelumnya
    for w in [3, 5, 10, 20, 50]:
        vals = sums[-w:] if n else np.array([])
        feats[f"roll{w}_sum_mean"] = float(vals.mean()) if len(vals) else 0.0
        feats[f"roll{w}_sum_std"] = float(vals.std(ddof=0)) if len(vals) else 0.0
        feats[f"roll{w}_sum_min"] = float(vals.min()) if len(vals) else 0.0
        feats[f"roll{w}_sum_max"] = float(vals.max()) if len(vals) else 0.0

    # Frekuensi digit semua posisi dalam N hasil terakhir
    for w in [10, 20, 50]:
        recent = hist[-w:].reshape(-1) if n else np.array([], dtype=int)
        counts = np.bincount(recent, minlength=10) if len(recent) else np.zeros(10, dtype=int)
        total = max(1, len(recent))
        for d in range(10):
            feats[f"freq{w}_digit{d}"] = counts[d] / total

    # Frekuensi digit per posisi dalam 20 hasil terakhir
    recent_pos = hist[-20:] if n else np.empty((0, 4), dtype=int)
    for pos in range(4):
        vals = recent_pos[:, pos] if len(recent_pos) else np.array([], dtype=int)
        counts = np.bincount(vals, minlength=10) if len(vals) else np.zeros(10, dtype=int)
        total = max(1, len(vals))
        for d in range(10):
            feats[f"freq20_pos{pos+1}_digit{d}"] = counts[d] / total

    # Gap: sudah berapa baris sejak digit muncul di posisi mana pun
    for d in range(10):
        gap = 999
        for back in range(1, min(n, 100) + 1):
            if d in hist[-back]:
                gap = back
                break
        feats[f"gap_digit{d}"] = gap

    return feats

# 7) Buat dataset supervised
history_digits = []
feature_rows = []
target_rows = []

for _, row in df.iterrows():
    ts = row["timestamp"]
    draw_pos = int(row["draw_pos"])

    features = compute_features_from_history(history_digits, ts, draw_pos, max_lag=10)
    feature_rows.append(features)
    target_rows.append([int(row["d1"]), int(row["d2"]), int(row["d3"]), int(row["d4"])])

    history_digits.append([int(row["d1"]), int(row["d2"]), int(row["d3"]), int(row["d4"])])

X = pd.DataFrame(feature_rows).fillna(0)
Y = pd.DataFrame(target_rows, columns=["d1", "d2", "d3", "d4"])

# Buang awal data karena histori/lag belum cukup
START_IDX = 50
X_model = X.iloc[START_IDX:].reset_index(drop=True)
Y_model = Y.iloc[START_IDX:].reset_index(drop=True)

print("X_model:", X_model.shape)
print("Y_model:", Y_model.shape)

# 8) Train-test split berdasarkan urutan waktu
split = int(len(X_model) * 0.8)
X_train, X_test = X_model.iloc[:split], X_model.iloc[split:]
Y_train, Y_test = Y_model.iloc[:split], Y_model.iloc[split:]

xgb_params = dict(
    objective="multi:softprob",
    num_class=10,
    n_estimators=180,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.85,
    colsample_bytree=0.85,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=2,
    reg_lambda=1.5,
)

# 9) Latih 4 model: satu model untuk tiap digit
models = {}
metrics = {}

for col in ["d1", "d2", "d3", "d4"]:
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, Y_train[col])

    pred = model.predict(X_test)
    proba = model.predict_proba(X_test)

    top1_acc = accuracy_score(Y_test[col], pred)

    # Kolom predict_proba mengikuti urutan model.classes_, jadi indeks kolom
    # diterjemahkan dulu ke label digit sebelum dibandingkan dengan target.
    top3_labels = model.classes_[np.argsort(proba, axis=1)[:, -3:]]
    top3_acc = np.mean((top3_labels == Y_test[col].to_numpy()[:, None]).any(axis=1))

    models[col] = model
    metrics[col] = {"top1_acc": top1_acc, "top3_acc": top3_acc}

metrics_df = pd.DataFrame(metrics).T
metrics_df.loc["AVERAGE"] = metrics_df.mean()
display(metrics_df)

# 10) Refit model memakai seluruh data untuk prediksi final
final_models = {}
for col in ["d1", "d2", "d3", "d4"]:
    model = XGBClassifier(**xgb_params)
    model.fit(X_model, Y_model[col])
    final_models[col] = model

# 11) Prediksi hasil selanjutnya
def predict_next_results(n_results=3):
    """
    n_results=3 karena pada data ini setiap timestamp berisi 3 hasil.
    Prediksi ke-2 dan ke-3 bersifat recursive: memakai hasil prediksi sebelumnya sebagai histori.
    """
    history_digits = df[["d1", "d2", "d3", "d4"]].astype(int).values.tolist()
    last_ts = df["timestamp"].iloc[-1]
    next_ts = next_timestamp_after(last_ts)

    forecasts = []

    for draw_pos in range(n_results):
        features = compute_features_from_history(history_digits, next_ts, draw_pos, max_lag=10)
        X_next = pd.DataFrame([features]).reindex(columns=X_model.columns, fill_value=0)

        predicted_digits = []
        top_candidates = {}

        for col in ["d1", "d2", "d3", "d4"]:
            proba = final_models[col].predict_proba(X_next)[0]
            classes = final_models[col].classes_

            # Ubah ke probabilitas penuh 0-9
            full_proba = np.zeros(10)
            for cls, p in zip(classes, proba):
                full_proba[int(cls)] = p

            top5 = np.argsort(full_proba)[::-1][:5]
            predicted_digit = int(top5[0])

            predicted_digits.append(predicted_digit)
            top_candidates[col] = [(int(d), float(full_proba[d])) for d in top5]

        pred_str = "".join(map(str, predicted_digits))

        forecasts.append({
            "timestamp": next_ts,
            "draw_pos": draw_pos,
            "predicted": pred_str,
            "top_candidates": top_candidates
        })

        # Tambahkan prediksi ke histori untuk prediksi recursive berikutnya
        history_digits.append(predicted_digits)

    return forecasts

forecasts = predict_next_results(n_results=3)

print("\nPrediksi hasil selanjutnya:")
for item in forecasts:
    print(f"timestamp={item['timestamp']} | draw_pos={item['draw_pos']} | prediksi={item['predicted']}")

# Detail kandidat digit top-5 per posisi
for item in forecasts:
    print("\n===================================")
    print(f"Prediksi: {item['predicted']} | timestamp={item['timestamp']} | draw_pos={item['draw_pos']}")
    for pos, candidates in item["top_candidates"].items():
        print(pos, ":", [(d, round(p, 4)) for d, p in candidates])

# Simpan hasil prediksi ke CSV
forecast_rows = []
for item in forecasts:
    row = {
        "timestamp": item["timestamp"],
        "draw_pos": item["draw_pos"],
        "predicted": item["predicted"],
    }
    for pos, candidates in item["top_candidates"].items():
        row[f"{pos}_top5"] = str([(d, round(p, 4)) for d, p in candidates])
    forecast_rows.append(row)

forecast_df = pd.DataFrame(forecast_rows)
forecast_df.to_csv("xgboost_next_predictions.csv", index=False)
display(forecast_df)

files.download("xgboost_next_predictions.csv")
'''

# NOTE: the script uses notebook-only syntax ("!pip", display, google.colab), so
# the .py copy is meant to be pasted into a Colab cell, not run with plain python.
py_path = Path("/mnt/data/xgboost_prediksi_colab.py")
py_path.write_text(colab_code.strip() + "\n", encoding="utf-8")

# Build the .ipynb: one markdown title cell followed by code cells.
cells = [
    {
        "cell_type": "markdown",
        "metadata": {},
        "source": [
            "# Prediksi Hasil Selanjutnya dengan XGBoost\n",
            "\n",
            "Notebook ini membaca `PROCESSED_DATA_READY_FOR_ML.csv`, membuat fitur berbasis histori sebelumnya, melatih XGBoost untuk memprediksi digit `d1` sampai `d4`, lalu memprediksi hasil berikutnya."
        ],
    },
]

# Split the script into cells at the numbered section comments ("# 1) ...",
# "# 2) ...").  A new cell starts only when the current one already has more
# than 8 lines, so short sections are merged into the preceding cell.
parts = []
current = []
for line in colab_code.strip().splitlines():
    is_section_header = line.startswith("# ") and any(
        line.startswith(f"# {i}") for i in range(1, 12)
    )
    if is_section_header and len(current) > 8:
        parts.append("\n".join(current))
        current = [line]
    else:
        current.append(line)
if current:
    parts.append("\n".join(current))

for part in parts:
    cells.append({
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": [line + "\n" for line in part.splitlines()],
    })

nb = {
    "cells": cells,
    "metadata": {
        "colab": {"provenance": []},
        "kernelspec": {"name": "python3", "display_name": "Python 3"},
        "language_info": {"name": "python"},
    },
    "nbformat": 4,
    "nbformat_minor": 5,
}

ipynb_path = Path("/mnt/data/xgboost_prediksi_colab.ipynb")
ipynb_path.write_text(json.dumps(nb, ensure_ascii=False, indent=2), encoding="utf-8")

# Also save the predictions obtained in the analysis above, for convenience.
result_df = pd.DataFrame([
    {"timestamp": "2026-06-08 17:30:00", "draw_pos": 0, "predicted": "6326"},
    {"timestamp": "2026-06-08 17:30:00", "draw_pos": 1, "predicted": "8333"},
    {"timestamp": "2026-06-08 17:30:00", "draw_pos": 2, "predicted": "7003"},
])
result_path = Path("/mnt/data/xgboost_next_predictions_result.csv")
result_df.to_csv(result_path, index=False)

print("File dibuat:")
print(py_path)
print(ipynb_path)
print(result_path)
Prediksi XGBoost dari data terakhir:
| Urutan | Timestamp prediksi | Prediksi |
|---|---|---|
| 1 | 2026-06-08 17:30:00 | 6326 |
| 2 | 2026-06-08 17:30:00 | 8333 |
| 3 | 2026-06-08 17:30:00 | 7003 |
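Catatan penting: hasil validasi model rendah — rata-rata top-1 accuracy per digit ≈ 8,82% dan top-3 accuracy ≈ 29,38%. Sebagai pembanding, menebak secara acak satu dari 10 digit menghasilkan sekitar 10% (top-1) dan 30% (top-3), sehingga model ini tidak lebih baik daripada tebakan acak. Jadi prediksi ini lebih cocok sebagai eksperimen ML, bukan jaminan akurat, karena data angka seperti ini cenderung sangat acak.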
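Saya sudah buatkan script Google Colab: `xgboost_prediksi_colab.ipynb` (notebook) dan `xgboost_prediksi_colab.py` (script), serta `xgboost_next_predictions_result.csv` yang berisi hasil prediksi di atas.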
Script tersebut sudah mencakup upload CSV, feature engineering tanpa data leakage, training XGBoost untuk d1–d4, evaluasi, prediksi 3 hasil berikutnya, dan export hasil prediksi ke CSV.