Ubah data saya agar bisa di cari algoritmanya dan ...
Erstellt am: 2. Juni 2026
Beantwortet mit GPT-5.5 Thinking von Chat01
Erstellt am: 2. Juni 2026
Beantwortet mit GPT-5.5 Thinking von Chat01
Ubah data saya agar bisa dicari algoritma dan rumusnya untuk hasil selanjutnya, dan buatkan script-nya untuk dijalankan di Google Colab.
from pathlib import Path

# Load the raw extraction dump as text. Undecodable bytes are replaced
# rather than raising, so a partially corrupted file can still be inspected.
with Path('/mnt/data/PRIZE_2_ONLY.txt').open(encoding='utf-8', errors='replace') as fh:
    text = fh.read()

# Notebook display: total character count and the first 500 characters.
len(text), text[:500]
# Recorded output:
# (61733, '=== HASIL EKSTRAKSI DATA PRIZE 2 ===\n\n=== HASIL EKSTRAKSI DATA PRIZE ===\n6112 0, 3, 4, 5, 7, 8, 9\n------------------------------\nTanggal: 11 Februari 2026 | Waktu: 03:30\n2992 0, 1, 3, 4, 5, 6, 7, 8\n------------------------------\nTanggal: 11 Februari 2026 | Waktu: 06:30\n9443 0, 1, 2, 5, 6, 7, 8\n------------------------------\nTanggal: 11 Februari 2026 | Waktu: 11:30\n8459 0, 1, 2, 3, 6, 7\n------------------------------\nTanggal: 11 Februari 2026 | Waktu: 13:30\n6407 1, 2, 3, 5, 8, 9\n------------')
import re

import numpy as np
import pandas as pd

# Indonesian month names mapped to zero-padded month numbers.
months = {
    'Januari':'01','Februari':'02','Maret':'03','April':'04','Mei':'05','Juni':'06',
    'Juli':'07','Agustus':'08','September':'09','Oktober':'10','November':'11','Desember':'12'
}

# One record = a 4-digit result line with its digit list, a dashed separator
# line, then the "Tanggal ... | Waktu ..." line that FOLLOWS the result.
pattern = re.compile(r'(?m)^(\d{4})\s+([0-9,\s]+)\n-+\nTanggal:\s*(\d{2})\s+(\w+)\s+(\d{4})\s+\|\s+Waktu:\s*(\d{2}:\d{2})')


def _record_from_match(match):
    """Turn one regex match into a ``(timestamp, result, missing_digits)`` tuple."""
    result, miss_str, dd, mon, yy, time = match.groups()
    # Unknown month names are kept visible as '??' instead of raising.
    stamp = f"{yy}-{months.get(mon, '??')}-{dd} {time}"
    pieces = (piece.strip() for piece in miss_str.split(','))
    return stamp, result, [int(piece) for piece in pieces if piece]


records = [_record_from_match(m) for m in pattern.finditer(text)]

# Notebook display: record count plus the first and last three records.
len(records), records[:3], records[-3:]
# Recorded output:
# (662, [('2026-02-11 03:30', '6112', [0, 3, 4, 5, 7, 8, 9]), ('2026-02-11 06:30', '2992', [0, 1, 3, 4, 5, 6, 7, 8]), ('2026-02-11 11:30', '9443', [0, 1, 2, 5, 6, 7, 8])], [('2026-06-01 21:30', '6249', [0, 1, 3, 5, 7, 8]), ('2026-06-02 03:30', '0324', [1, 5, 6, 7, 8, 9]), ('2026-06-02 06:30', '7426', [0, 1, 3, 5, 8, 9])])
# Show the last 1000 characters to see how the dump ends.
tail = text[-1000:]
print(tail)

# Show the first 25 lines with 1-based numbering; repr() exposes any
# hidden whitespace or control characters in each line.
head_lines = text.splitlines()[:25]
for i, line in enumerate(head_lines, start=1):
    print(i, repr(line))
def parse_date_before(text):
    """Pair each result line with the most recent ``Tanggal`` line above it.

    The text is scanned line by line. A line of the form
    ``Tanggal: <day> <month name> <year> | Waktu: <HH:MM>`` sets the current
    timestamp. A line of the form ``<4 digits> <digit list>`` is a result and
    is attached to that timestamp. All other lines (headers, dashed
    separators) are ignored.

    Args:
        text: Full contents of the extraction dump.

    Returns:
        A tuple ``(orphan, records)``. ``records`` is a list of
        ``(timestamp, result, missing_digits)`` tuples, where ``timestamp`` is
        a ``"YYYY-MM-DD HH:MM"`` string, ``result`` is the 4-digit string and
        ``missing_digits`` is a list of ints. ``orphan`` has the same tuple
        shape with ``None`` as timestamp and holds results that appear before
        any ``Tanggal`` line.

    Raises:
        KeyError: If a ``Tanggal`` line carries a month name that is not one
            of the twelve Indonesian month names.
    """
    months = {'Januari':'01','Februari':'02','Maret':'03','April':'04','Mei':'05','Juni':'06','Juli':'07','Agustus':'08','September':'09','Oktober':'10','November':'11','Desember':'12'}
    # Compiled once per call instead of being re-parsed for every line.
    # The day accepts one or two digits so "1 Maret" is not silently skipped.
    date_re = re.compile(r'Tanggal:\s*(\d{1,2})\s+(\w+)\s+(\d{4})\s+\|\s+Waktu:\s*(\d{2}:\d{2})')
    result_re = re.compile(r'^(\d{4})\s+([0-9,\s]+)$')

    current_dt = None
    records = []
    orphan = []
    for raw_line in text.splitlines():
        # Strip once so date and result lines are matched under the same
        # rules; an indented date line must not be missed, otherwise later
        # results would inherit a stale timestamp.
        line = raw_line.strip()

        m = date_re.match(line)
        if m:
            dd, mon, yy, tm = m.groups()
            current_dt = f"{yy}-{months[mon]}-{dd.zfill(2)} {tm}"
            continue

        m = result_re.match(line)
        if not m:
            continue
        result, miss_str = m.groups()
        miss = [int(x) for x in re.findall(r'\d', miss_str)]
        if current_dt is None:
            orphan.append((None, result, miss))
        else:
            records.append((current_dt, result, miss))
    return orphan, records


# Usage in the notebook session (with ``text`` loaded from the dump):
#   orphan, rec_before = parse_date_before(text)
#   len(orphan), orphan[:1], len(rec_before), rec_before[:3], rec_before[-3:]
# Recorded output:
#   (1, [(None, '6112', [0, 3, 4, 5, 7, 8, 9])], 662, [('2026-02-11 03:30', '2992', [0, 1, 3, 4, 5, 6, 7, 8]), ('2026-02-11 06:30', '9443', [0, 1, 2, 5, 6, 7, 8]), ('2026-02-11 11:30', '8459', [0, 1, 2, 3, 6, 7])], [('2026-06-01 21:30', '0324', [1, 5, 6, 7, 8, 9]), ('2026-06-02 03:30', '7426', [0, 1, 3, 5, 8, 9]), ('2026-06-02 06:30', '7738', [0, 1, 2, 4, 5, 6, 9])])
pythonfrom pathlib import Path script = r''' # ============================================================ # PRIZE 2 - Parser, Analisis, Formula Skor, dan Prediksi Probabilistik # Cocok dijalankan di Google Colab. # # Cara pakai di Colab: # 1) Upload file PRIZE_2_ONLY.txt ke Colab. # 2) Jalankan semua cell / script ini. # 3) Output: prize2_clean.csv, prize2_features.csv, prize2_next_predictions.csv # # Catatan penting: # Data undian bersifat acak. Script ini mencari pola historis dan membuat # prediksi probabilistik, BUKAN rumus pasti dan BUKAN jaminan hasil. # ============================================================ import re import os import itertools from pathlib import Path import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score # ------------------------------------------------------------ # KONFIGURASI # ------------------------------------------------------------ FILE_PATH = "PRIZE_2_ONLY.txt" # Format file Anda terlihat seperti: # Tanggal: ... | Waktu: ... # 2992 0, 1, 3, ... # Jadi default-nya tanggal dianggap milik angka SETELAH baris tanggal. # Jika ternyata tanggal di file Anda adalah milik angka SEBELUM baris tanggal, # ubah menjadi "date_after_result". 
DATE_MODE = "date_before_result" # pilihan: "date_before_result" atau "date_after_result" NEXT_N_DRAWS = 6 # jumlah jadwal berikutnya yang ingin diprediksi TOP_N_NUMBERS = 20 # jumlah kandidat 4D yang ditampilkan per jadwal RANDOM_STATE = 42 DRAW_TIMES = ["03:30", "06:30", "11:30", "13:30", "17:30", "21:30"] MONTHS_ID = { "Januari": 1, "Februari": 2, "Maret": 3, "April": 4, "Mei": 5, "Juni": 6, "Juli": 7, "Agustus": 8, "September": 9, "Oktober": 10, "November": 11, "Desember": 12 } ALL_DIGITS = set(range(10)) # ------------------------------------------------------------ # UPLOAD OTOMATIS JIKA DI GOOGLE COLAB # ------------------------------------------------------------ def resolve_file_path(default_path=FILE_PATH): path = Path(default_path) if path.exists(): return str(path) # Jika dijalankan di Colab, minta upload file. try: from google.colab import files print("File belum ditemukan. Silakan upload file TXT Anda...") uploaded = files.upload() if not uploaded: raise FileNotFoundError("Tidak ada file yang diupload.") first_name = next(iter(uploaded.keys())) return first_name except Exception as e: raise FileNotFoundError( f"File '{default_path}' tidak ditemukan. Upload file ke Colab atau ubah FILE_PATH. 
Error: {e}" ) # ------------------------------------------------------------ # PARSER # ------------------------------------------------------------ def parse_missing_list(s): return [int(x) for x in re.findall(r"\d", str(s))] def parse_date_line(line): m = re.search(r"Tanggal:\s*(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\|\s+Waktu:\s*(\d{2}:\d{2})", line) if not m: return None day, month_name, year, hhmm = m.groups() month = MONTHS_ID.get(month_name) if month is None: raise ValueError(f"Nama bulan tidak dikenali: {month_name}") return pd.to_datetime(f"{year}-{month:02d}-{int(day):02d} {hhmm}") def parse_result_line(line): m = re.match(r"^\s*(\d{4})\s+([0-9,\s]+)\s*$", line) if not m: return None result = m.group(1) missing_given = parse_missing_list(m.group(2)) return result, missing_given def parse_prize_txt(text, date_mode="date_before_result"): lines = text.splitlines() rows = [] if date_mode == "date_before_result": current_dt = None orphan_results = [] for line in lines: dt = parse_date_line(line) if dt is not None: current_dt = dt continue parsed = parse_result_line(line) if parsed is None: continue result, missing_given = parsed if current_dt is None: orphan_results.append({"datetime": pd.NaT, "result": result, "missing_given": missing_given}) else: rows.append({"datetime": current_dt, "result": result, "missing_given": missing_given}) if orphan_results: print(f"Peringatan: {len(orphan_results)} baris angka tidak punya tanggal dan dilewati. Contoh: {orphan_results[0]}") elif date_mode == "date_after_result": # Tanggal dianggap milik angka sebelumnya. 
pattern = re.compile( r"(?m)^(\d{4})\s+([0-9,\s]+)\n-+\nTanggal:\s*(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\|\s+Waktu:\s*(\d{2}:\d{2})" ) for m in pattern.finditer(text): result, miss_str, day, month_name, year, hhmm = m.groups() month = MONTHS_ID.get(month_name) if month is None: raise ValueError(f"Nama bulan tidak dikenali: {month_name}") dt = pd.to_datetime(f"{year}-{month:02d}-{int(day):02d} {hhmm}") rows.append({"datetime": dt, "result": result, "missing_given": parse_missing_list(miss_str)}) else: raise ValueError("date_mode harus 'date_before_result' atau 'date_after_result'.") df = pd.DataFrame(rows) if df.empty: raise ValueError("Tidak ada data yang berhasil diparse. Cek format file atau DATE_MODE.") df = df.drop_duplicates(subset=["datetime", "result"]).sort_values("datetime").reset_index(drop=True) # Pecah digit 4D: ribuan, ratusan, puluhan, satuan for i, col in enumerate(["d1_ribuan", "d2_ratusan", "d3_puluhan", "d4_satuan"]): df[col] = df["result"].str[i].astype(int) df["result_int"] = df["result"].astype(int) df["digit_sum"] = df[["d1_ribuan", "d2_ratusan", "d3_puluhan", "d4_satuan"]].sum(axis=1) df["odd_count"] = df[["d1_ribuan", "d2_ratusan", "d3_puluhan", "d4_satuan"]].apply(lambda r: sum(int(x % 2 == 1) for x in r), axis=1) df["unique_digit_count"] = df["result"].apply(lambda s: len(set(map(int, s)))) # Rumus daftar angka yang tidak muncul: # missing = {0,1,2,3,4,5,6,7,8,9} - unique digit hasil df["missing_calc"] = df["result"].apply(lambda s: sorted(list(ALL_DIGITS - set(map(int, s))))) df["missing_given_sorted"] = df["missing_given"].apply(lambda x: sorted(x)) df["missing_ok"] = df["missing_calc"].astype(str) == df["missing_given_sorted"].astype(str) df["missing_count"] = df["missing_calc"].apply(len) # Kolom waktu df["date"] = df["datetime"].dt.date.astype(str) df["time"] = df["datetime"].dt.strftime("%H:%M") df["hour"] = df["datetime"].dt.hour df["minute"] = df["datetime"].dt.minute df["day"] = df["datetime"].dt.day df["month"] = 
df["datetime"].dt.month df["dayofweek"] = df["datetime"].dt.dayofweek slot_map = {t: i for i, t in enumerate(DRAW_TIMES)} df["draw_slot"] = df["time"].map(slot_map).fillna(-1).astype(int) # One-hot missing digit: apakah digit X tidak muncul pada result for d in range(10): df[f"miss_{d}"] = df["missing_calc"].apply(lambda xs, d=d: int(d in xs)) return df # ------------------------------------------------------------ # FEATURE ENGINEERING TANPA LEAKAGE # ------------------------------------------------------------ DIGIT_COLS = ["d1_ribuan", "d2_ratusan", "d3_puluhan", "d4_satuan"] def build_features(df): df = df.sort_values("datetime").reset_index(drop=True).copy() feat = pd.DataFrame(index=df.index) # Fitur waktu sekarang feat["idx"] = np.arange(len(df)) for c in ["hour", "minute", "day", "month", "dayofweek", "draw_slot"]: feat[c] = df[c].astype(float) # Lag digit dan ringkasan hasil sebelumnya for lag in [1, 2, 3, 5, 10]: for col in DIGIT_COLS: feat[f"lag{lag}_{col}"] = df[col].shift(lag) feat[f"lag{lag}_digit_sum"] = df["digit_sum"].shift(lag) feat[f"lag{lag}_odd_count"] = df["odd_count"].shift(lag) feat[f"lag{lag}_unique_digit_count"] = df["unique_digit_count"].shift(lag) feat[f"lag{lag}_missing_count"] = df["missing_count"].shift(lag) # Rolling statistic hanya memakai masa lalu: shift(1) for win in [5, 10, 20, 50]: for col in DIGIT_COLS: shifted = df[col].shift(1) feat[f"roll{win}_{col}_mean"] = shifted.rolling(win, min_periods=2).mean() feat[f"roll{win}_{col}_std"] = shifted.rolling(win, min_periods=2).std() # Frekuensi digit per posisi dalam rolling window for win in [10, 30, 100]: for pos_col in DIGIT_COLS: shifted = df[pos_col].shift(1) for digit in range(10): feat[f"freq{win}_{pos_col}_{digit}"] = shifted.rolling(win, min_periods=2).apply( lambda x, digit=digit: np.mean(x == digit), raw=False ) # Target digit tiap posisi target = df[DIGIT_COLS].copy() # Isi nilai awal yang belum punya lag/rolling. 
feat = feat.fillna(-1) return feat, target # ------------------------------------------------------------ # FORMULA SKOR MANUAL: FREKUENSI + RECENCY + TIME SLOT + MARKOV # ------------------------------------------------------------ def normalize_prob(arr): arr = np.asarray(arr, dtype=float) arr = np.maximum(arr, 1e-12) return arr / arr.sum() def formula_position_probs(history_df, next_dt, alpha=0.40, beta=0.25, gamma=0.25, delta=0.10, decay=0.985, smoothing=1.0): """ Rumus skor digit per posisi: P_final(d) = alpha*P_global(d) + beta*P_recency(d) + gamma*P_time(d) + delta*P_markov(d) P_global = frekuensi digit sepanjang sejarah P_recency = frekuensi berbobot, data terbaru bobotnya lebih besar P_time = frekuensi pada slot jam yang sama P_markov = peluang digit sekarang d jika digit sebelumnya pada posisi yang sama = digit terakhir """ h = history_df.sort_values("datetime").reset_index(drop=True).copy() next_time = pd.to_datetime(next_dt).strftime("%H:%M") probs_by_pos = {} n = len(h) weights = decay ** np.arange(n - 1, -1, -1) # baris terbaru bobot terbesar for pos_col in DIGIT_COLS: values = h[pos_col].astype(int).values # Global p_global = np.array([(np.sum(values == d) + smoothing) / (len(values) + 10*smoothing) for d in range(10)]) # Recency weighted p_recency = [] for d in range(10): p_recency.append((np.sum(weights * (values == d)) + smoothing) / (np.sum(weights) + 10*smoothing)) p_recency = np.array(p_recency) # Same time slot same_slot = h[h["time"] == next_time] if len(same_slot) >= 5: sv = same_slot[pos_col].astype(int).values p_time = np.array([(np.sum(sv == d) + smoothing) / (len(sv) + 10*smoothing) for d in range(10)]) else: p_time = p_global.copy() # Markov sederhana: digit posisi sekarang tergantung digit posisi sebelumnya. 
if len(h) >= 2: last_digit = int(h[pos_col].iloc[-1]) prev_vals = h[pos_col].shift(1).values[1:].astype(int) curr_vals = h[pos_col].values[1:].astype(int) mask = prev_vals == last_digit if np.sum(mask) >= 3: mv = curr_vals[mask] p_markov = np.array([(np.sum(mv == d) + smoothing) / (len(mv) + 10*smoothing) for d in range(10)]) else: p_markov = p_global.copy() else: p_markov = p_global.copy() combined = alpha*p_global + beta*p_recency + gamma*p_time + delta*p_markov probs_by_pos[pos_col] = normalize_prob(combined) return probs_by_pos # ------------------------------------------------------------ # MODEL ML # ------------------------------------------------------------ def train_models(df): X, y = build_features(df) # Split time-series sederhana: 80% train, 20% test split = int(len(df) * 0.80) X_train, X_test = X.iloc[:split], X.iloc[split:] y_train, y_test = y.iloc[:split], y.iloc[split:] models = {} report_rows = [] for pos_col in DIGIT_COLS: model = RandomForestClassifier( n_estimators=400, random_state=RANDOM_STATE, min_samples_leaf=2, max_features="sqrt", n_jobs=-1 ) model.fit(X_train, y_train[pos_col]) models[pos_col] = model pred = model.predict(X_test) proba = model.predict_proba(X_test) classes = list(model.classes_) top3_ok = [] for i, true_digit in enumerate(y_test[pos_col].values): order = np.argsort(proba[i])[::-1][:3] top3_digits = [classes[j] for j in order] top3_ok.append(int(true_digit in top3_digits)) report_rows.append({ "posisi": pos_col, "akurasi_top1": accuracy_score(y_test[pos_col], pred), "akurasi_top3": float(np.mean(top3_ok)), "jumlah_test": len(X_test) }) # Refit pakai semua data untuk prediksi final final_models = {} for pos_col in DIGIT_COLS: model = RandomForestClassifier( n_estimators=400, random_state=RANDOM_STATE, min_samples_leaf=2, max_features="sqrt", n_jobs=-1 ) model.fit(X, y[pos_col]) final_models[pos_col] = model report = pd.DataFrame(report_rows) return final_models, report def align_model_proba(model, X_row): raw = 
model.predict_proba(X_row)[0] p = np.zeros(10, dtype=float) for cls, prob in zip(model.classes_, raw): p[int(cls)] = prob return normalize_prob(p) def next_draw_times(last_dt, n=6): last_dt = pd.to_datetime(last_dt) out = [] cursor_date = last_dt.date() last_time = last_dt.strftime("%H:%M") # cari jadwal setelah last_time pada tanggal yang sama all_times = DRAW_TIMES for _ in range(n): found = None for t in all_times: candidate = pd.to_datetime(f"{cursor_date} {t}") if candidate > last_dt: found = candidate break if found is None: cursor_date = (pd.to_datetime(cursor_date) + pd.Timedelta(days=1)).date() found = pd.to_datetime(f"{cursor_date} {all_times[0]}") out.append(found) last_dt = found cursor_date = last_dt.date() return out def make_future_feature_row(history_df, future_dt): # Tambahkan baris dummy. Fitur tidak boleh memakai digit/current result dari baris dummy. dummy = { "datetime": pd.to_datetime(future_dt), "result": "0000", "missing_given": list(range(1, 10)), "d1_ribuan": 0, "d2_ratusan": 0, "d3_puluhan": 0, "d4_satuan": 0, "result_int": 0, "digit_sum": 0, "odd_count": 0, "unique_digit_count": 1, "missing_calc": list(range(1,10)), "missing_given_sorted": list(range(1,10)), "missing_ok": True, "missing_count": 9, "date": pd.to_datetime(future_dt).date().isoformat(), "time": pd.to_datetime(future_dt).strftime("%H:%M"), "hour": pd.to_datetime(future_dt).hour, "minute": pd.to_datetime(future_dt).minute, "day": pd.to_datetime(future_dt).day, "month": pd.to_datetime(future_dt).month, "dayofweek": pd.to_datetime(future_dt).dayofweek, "draw_slot": {t:i for i,t in enumerate(DRAW_TIMES)}.get(pd.to_datetime(future_dt).strftime("%H:%M"), -1), } for d in range(10): dummy[f"miss_{d}"] = int(d != 0) temp = pd.concat([history_df, pd.DataFrame([dummy])], ignore_index=True) X, _ = build_features(temp) return X.tail(1) def candidates_from_position_probs(position_probs, top_k_each_pos=6, top_n=20): pos_order = DIGIT_COLS top_digits = [] for pos_col in pos_order: p = 
position_probs[pos_col] order = np.argsort(p)[::-1][:top_k_each_pos] top_digits.append(order.tolist()) rows = [] for combo in itertools.product(*top_digits): score = 1.0 for pos_col, digit in zip(pos_order, combo): score *= float(position_probs[pos_col][digit]) number = "".join(map(str, combo)) missing = sorted(list(ALL_DIGITS - set(combo))) rows.append({ "candidate": number, "score": score, "missing_digits": ",".join(map(str, missing)) }) out = pd.DataFrame(rows).sort_values("score", ascending=False).head(top_n).reset_index(drop=True) out["rank"] = np.arange(1, len(out)+1) out["score_pct"] = out["score"] / out["score"].sum() * 100 return out[["rank", "candidate", "score", "score_pct", "missing_digits"]] def combine_probs(rf_probs, formula_probs, rf_weight=0.60, formula_weight=0.40): combined = {} for pos_col in DIGIT_COLS: combined[pos_col] = normalize_prob(rf_weight*np.array(rf_probs[pos_col]) + formula_weight*np.array(formula_probs[pos_col])) return combined def predict_next(df, models, n_draws=6, top_n_numbers=20): history = df.copy() future_times = next_draw_times(history["datetime"].max(), n=n_draws) all_prediction_rows = [] for future_dt in future_times: X_row = make_future_feature_row(history, future_dt) rf_probs = {pos_col: align_model_proba(models[pos_col], X_row) for pos_col in DIGIT_COLS} formula_probs = formula_position_probs(history, future_dt) final_probs = combine_probs(rf_probs, formula_probs) cand = candidates_from_position_probs(final_probs, top_k_each_pos=6, top_n=top_n_numbers) cand.insert(0, "predicted_datetime", pd.to_datetime(future_dt)) all_prediction_rows.append(cand) # Untuk prediksi multi-jadwal, kandidat rank 1 dipakai sebagai history sementara. 
top_number = cand.iloc[0]["candidate"] new_row = { "datetime": pd.to_datetime(future_dt), "result": top_number, "missing_given": sorted(list(ALL_DIGITS - set(map(int, top_number)))) } temp_df = parse_prize_txt("", date_mode="date_before_result") if False else None # placeholder agar struktur jelas add = pd.DataFrame([new_row]) for i, col in enumerate(DIGIT_COLS): add[col] = add["result"].str[i].astype(int) add["result_int"] = add["result"].astype(int) add["digit_sum"] = add[DIGIT_COLS].sum(axis=1) add["odd_count"] = add[DIGIT_COLS].apply(lambda r: sum(int(x % 2 == 1) for x in r), axis=1) add["unique_digit_count"] = add["result"].apply(lambda s: len(set(map(int, s)))) add["missing_calc"] = add["result"].apply(lambda s: sorted(list(ALL_DIGITS - set(map(int, s))))) add["missing_given_sorted"] = add["missing_given"] add["missing_ok"] = True add["missing_count"] = add["missing_calc"].apply(len) add["date"] = add["datetime"].dt.date.astype(str) add["time"] = add["datetime"].dt.strftime("%H:%M") add["hour"] = add["datetime"].dt.hour add["minute"] = add["datetime"].dt.minute add["day"] = add["datetime"].dt.day add["month"] = add["datetime"].dt.month add["dayofweek"] = add["datetime"].dt.dayofweek add["draw_slot"] = add["time"].map({t:i for i,t in enumerate(DRAW_TIMES)}).fillna(-1).astype(int) for d in range(10): add[f"miss_{d}"] = add["missing_calc"].apply(lambda xs, d=d: int(d in xs)) history = pd.concat([history, add[history.columns]], ignore_index=True) return pd.concat(all_prediction_rows, ignore_index=True) # ------------------------------------------------------------ # MAIN # ------------------------------------------------------------ def main(): file_path = resolve_file_path(FILE_PATH) text = Path(file_path).read_text(encoding="utf-8", errors="replace") df = parse_prize_txt(text, date_mode=DATE_MODE) print(f"Jumlah baris terstruktur: {len(df)}") print(f"Rentang data: {df['datetime'].min()} sampai {df['datetime'].max()}") print(f"Cek rumus missing digit cocok: 
{df['missing_ok'].mean()*100:.2f}%") # Simpan data bersih df.to_csv("prize2_clean.csv", index=False) # Simpan fitur X, y = build_features(df) features = pd.concat([df[["datetime", "result"]], X, y.add_prefix("target_")], axis=1) features.to_csv("prize2_features.csv", index=False) # Train model models, report = train_models(df) report.to_csv("prize2_model_report.csv", index=False) print("\nEvaluasi model pada 20% data terakhir:") print(report.to_string(index=False)) # Prediksi jadwal berikutnya predictions = predict_next(df, models, n_draws=NEXT_N_DRAWS, top_n_numbers=TOP_N_NUMBERS) predictions.to_csv("prize2_next_predictions.csv", index=False) print("\nTop kandidat untuk jadwal berikutnya:") print(predictions.groupby("predicted_datetime").head(5).to_string(index=False)) print("\nFile output dibuat:") print("- prize2_clean.csv") print("- prize2_features.csv") print("- prize2_model_report.csv") print("- prize2_next_predictions.csv") return df, features, report, predictions if __name__ == "__main__": df, features, report, predictions = main() ''' open('/mnt/data/temp_script.py','w').write(script) len(script.splitlines()) 552
pythonscript2 = r''' # ============================================================ # PRIZE 2 - Data Cleaner, Formula Skor, dan Prediksi Probabilistik # Untuk Google Colab # # Cara pakai: # 1) Upload file PRIZE_2_ONLY.txt ke Colab. # 2) Jalankan script ini. # 3) Output: prize2_clean.csv, prize2_features.csv, # prize2_model_report.csv, prize2_next_predictions.csv # # Penting: # Data undian/angka acak tidak punya rumus pasti. Script ini hanya mencari # pola historis dan membuat kandidat probabilistik, bukan jaminan hasil. # ============================================================ import re import itertools from pathlib import Path import numpy as np import pandas as pd from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score # ----------------------- # KONFIGURASI # ----------------------- FILE_PATH = "PRIZE_2_ONLY.txt" # Default: baris tanggal dianggap milik angka SETELAH baris tanggal. # Jika Anda yakin tanggal di file adalah milik angka SEBELUM baris tanggal, # ubah ke "date_after_result". DATE_MODE = "date_before_result" # "date_before_result" atau "date_after_result" NEXT_N_DRAWS = 6 TOP_N_NUMBERS = 20 RANDOM_STATE = 42 DRAW_TIMES = ["03:30", "06:30", "11:30", "13:30", "17:30", "21:30"] DIGIT_COLS = ["d1_ribuan", "d2_ratusan", "d3_puluhan", "d4_satuan"] ALL_DIGITS = set(range(10)) MONTHS_ID = { "Januari": 1, "Februari": 2, "Maret": 3, "April": 4, "Mei": 5, "Juni": 6, "Juli": 7, "Agustus": 8, "September": 9, "Oktober": 10, "November": 11, "Desember": 12 } def resolve_file_path(default_path=FILE_PATH): path = Path(default_path) if path.exists(): return str(path) try: from google.colab import files print("File belum ditemukan. Silakan upload file TXT Anda...") uploaded = files.upload() if not uploaded: raise FileNotFoundError("Tidak ada file yang diupload.") return next(iter(uploaded.keys())) except Exception as e: raise FileNotFoundError( f"File '{default_path}' tidak ditemukan. Upload file ke Colab atau ubah FILE_PATH. 
Error: {e}" ) # ----------------------- # PARSER DATA # ----------------------- def parse_missing_list(s): return [int(x) for x in re.findall(r"\d", str(s))] def parse_date_line(line): m = re.search(r"Tanggal:\s*(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\|\s+Waktu:\s*(\d{2}:\d{2})", line) if not m: return None day, month_name, year, hhmm = m.groups() month = MONTHS_ID.get(month_name) if month is None: raise ValueError(f"Nama bulan tidak dikenali: {month_name}") return pd.to_datetime(f"{year}-{month:02d}-{int(day):02d} {hhmm}") def parse_result_line(line): m = re.match(r"^\s*(\d{4})\s+([0-9,\s]+)\s*$", line) if not m: return None return m.group(1), parse_missing_list(m.group(2)) def parse_prize_txt(text, date_mode=DATE_MODE): rows = [] if date_mode == "date_before_result": current_dt = None orphan_count = 0 for line in text.splitlines(): dt = parse_date_line(line) if dt is not None: current_dt = dt continue parsed = parse_result_line(line) if parsed is None: continue result, missing_given = parsed if current_dt is None: orphan_count += 1 else: rows.append({"datetime": current_dt, "result": result, "missing_given": missing_given}) if orphan_count: print(f"Peringatan: {orphan_count} angka tanpa timestamp dilewati. Ini normal jika baris pertama tidak punya tanggal.") elif date_mode == "date_after_result": pattern = re.compile( r"(?m)^(\d{4})\s+([0-9,\s]+)\n-+\nTanggal:\s*(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})\s+\|\s+Waktu:\s*(\d{2}:\d{2})" ) for m in pattern.finditer(text): result, miss_str, day, month_name, year, hhmm = m.groups() month = MONTHS_ID.get(month_name) if month is None: raise ValueError(f"Nama bulan tidak dikenali: {month_name}") dt = pd.to_datetime(f"{year}-{month:02d}-{int(day):02d} {hhmm}") rows.append({"datetime": dt, "result": result, "missing_given": parse_missing_list(miss_str)}) else: raise ValueError("DATE_MODE harus 'date_before_result' atau 'date_after_result'.") df = pd.DataFrame(rows) if df.empty: raise ValueError("Tidak ada data yang berhasil diparse. 
Cek format file atau DATE_MODE.") df = df.drop_duplicates(subset=["datetime", "result"]).sort_values("datetime").reset_index(drop=True) for i, col in enumerate(DIGIT_COLS): df[col] = df["result"].str[i].astype(int) df["result_int"] = df["result"].astype(int) df["digit_sum"] = df[DIGIT_COLS].sum(axis=1) df["odd_count"] = df[DIGIT_COLS].apply(lambda r: sum(int(x % 2 == 1) for x in r), axis=1) df["unique_digit_count"] = df["result"].apply(lambda s: len(set(map(int, s)))) # Rumus missing digit: # missing_digits = {0,1,2,3,4,5,6,7,8,9} - unique_digit(result) df["missing_calc"] = df["result"].apply(lambda s: sorted(list(ALL_DIGITS - set(map(int, s))))) df["missing_given_sorted"] = df["missing_given"].apply(lambda x: sorted(x)) df["missing_ok"] = df["missing_calc"].astype(str) == df["missing_given_sorted"].astype(str) df["missing_count"] = df["missing_calc"].apply(len) df["date"] = df["datetime"].dt.date.astype(str) df["time"] = df["datetime"].dt.strftime("%H:%M") df["hour"] = df["datetime"].dt.hour df["minute"] = df["datetime"].dt.minute df["day"] = df["datetime"].dt.day df["month"] = df["datetime"].dt.month df["dayofweek"] = df["datetime"].dt.dayofweek df["draw_slot"] = df["time"].map({t: i for i, t in enumerate(DRAW_TIMES)}).fillna(-1).astype(int) for d in range(10): df[f"miss_{d}"] = df["missing_calc"].apply(lambda xs, d=d: int(d in xs)) return df # ----------------------- # FEATURE ENGINEERING # ----------------------- def build_features(df): df = df.sort_values("datetime").reset_index(drop=True).copy() feats = {} feats["idx"] = np.arange(len(df)) for c in ["hour", "minute", "day", "month", "dayofweek", "draw_slot"]: feats[c] = df[c].astype(float) # Lag: model hanya melihat data masa lalu, bukan hasil saat ini. 
for lag in [1, 2, 3, 5, 10]: for col in DIGIT_COLS: feats[f"lag{lag}_{col}"] = df[col].shift(lag) feats[f"lag{lag}_digit_sum"] = df["digit_sum"].shift(lag) feats[f"lag{lag}_odd_count"] = df["odd_count"].shift(lag) feats[f"lag{lag}_unique_digit_count"] = df["unique_digit_count"].shift(lag) feats[f"lag{lag}_missing_count"] = df["missing_count"].shift(lag) # Rolling statistik dari masa lalu. for win in [5, 10, 20, 50]: for col in DIGIT_COLS: shifted = df[col].shift(1) feats[f"roll{win}_{col}_mean"] = shifted.rolling(win, min_periods=2).mean() feats[f"roll{win}_{col}_std"] = shifted.rolling(win, min_periods=2).std() # Rolling frekuensi digit sederhana, cepat, dan informatif. for win in [20, 100]: for col in DIGIT_COLS: shifted = df[col].shift(1) for digit in range(10): feats[f"freq{win}_{col}_{digit}"] = (shifted == digit).astype(float).rolling(win, min_periods=2).mean() X = pd.DataFrame(feats).fillna(-1) y = df[DIGIT_COLS].copy() return X, y # ----------------------- # FORMULA SKOR MANUAL # ----------------------- def normalize_prob(arr): arr = np.asarray(arr, dtype=float) arr = np.maximum(arr, 1e-12) return arr / arr.sum() def formula_position_probs(history_df, next_dt, alpha=0.40, beta=0.25, gamma=0.25, delta=0.10, decay=0.985, smoothing=1.0): """ Formula per posisi: P_final(d) = alpha*P_global(d) + beta*P_recency(d) + gamma*P_time(d) + delta*P_markov(d) P_global = frekuensi digit sepanjang data P_recency = frekuensi berbobot; data terbaru lebih besar bobotnya P_time = frekuensi pada jam/slot yang sama P_markov = peluang digit saat ini berdasarkan digit posisi yang sama pada draw sebelumnya """ h = history_df.sort_values("datetime").reset_index(drop=True).copy() next_time = pd.to_datetime(next_dt).strftime("%H:%M") n = len(h) weights = decay ** np.arange(n - 1, -1, -1) out = {} for pos_col in DIGIT_COLS: values = h[pos_col].astype(int).values p_global = np.array([(np.sum(values == d) + smoothing) / (len(values) + 10*smoothing) for d in range(10)]) p_recency = 
np.array([ (np.sum(weights * (values == d)) + smoothing) / (np.sum(weights) + 10*smoothing) for d in range(10) ]) same_slot = h[h["time"] == next_time] if len(same_slot) >= 5: sv = same_slot[pos_col].astype(int).values p_time = np.array([(np.sum(sv == d) + smoothing) / (len(sv) + 10*smoothing) for d in range(10)]) else: p_time = p_global.copy() if len(h) >= 2: last_digit = int(h[pos_col].iloc[-1]) prev_vals = h[pos_col].shift(1).values[1:].astype(int) curr_vals = h[pos_col].values[1:].astype(int) mask = prev_vals == last_digit if np.sum(mask) >= 3: mv = curr_vals[mask] p_markov = np.array([(np.sum(mv == d) + smoothing) / (len(mv) + 10*smoothing) for d in range(10)]) else: p_markov = p_global.copy() else: p_markov = p_global.copy() out[pos_col] = normalize_prob(alpha*p_global + beta*p_recency + gamma*p_time + delta*p_markov) return out # ----------------------- # TRAIN & PREDICT # ----------------------- def train_models(df): X, y = build_features(df) split = int(len(df) * 0.80) X_train, X_test = X.iloc[:split], X.iloc[split:] y_train, y_test = y.iloc[:split], y.iloc[split:] final_models = {} report_rows = [] for pos_col in DIGIT_COLS: test_model = RandomForestClassifier( n_estimators=200, random_state=RANDOM_STATE, min_samples_leaf=2, max_features="sqrt", n_jobs=-1 ) test_model.fit(X_train, y_train[pos_col]) pred = test_model.predict(X_test) proba = test_model.predict_proba(X_test) classes = list(test_model.classes_) top3_ok = [] for i, true_digit in enumerate(y_test[pos_col].values): top3_digits = [classes[j] for j in np.argsort(proba[i])[::-1][:3]] top3_ok.append(int(true_digit in top3_digits)) report_rows.append({ "posisi": pos_col, "akurasi_top1": accuracy_score(y_test[pos_col], pred), "akurasi_top3": float(np.mean(top3_ok)), "jumlah_test": len(X_test) }) # Model final pakai semua data. 
final_model = RandomForestClassifier( n_estimators=200, random_state=RANDOM_STATE, min_samples_leaf=2, max_features="sqrt", n_jobs=-1 ) final_model.fit(X, y[pos_col]) final_models[pos_col] = final_model return final_models, pd.DataFrame(report_rows) def align_model_proba(model, X_row): raw = model.predict_proba(X_row)[0] p = np.zeros(10, dtype=float) for cls, prob in zip(model.classes_, raw): p[int(cls)] = prob return normalize_prob(p) def next_draw_times(last_dt, n=6): last_dt = pd.to_datetime(last_dt) out = [] for _ in range(n): current_date = last_dt.date() found = None for t in DRAW_TIMES: candidate = pd.to_datetime(f"{current_date} {t}") if candidate > last_dt: found = candidate break if found is None: next_date = (last_dt + pd.Timedelta(days=1)).date() found = pd.to_datetime(f"{next_date} {DRAW_TIMES[0]}") out.append(found) last_dt = found return out def add_result_fields(df): df = df.copy() for i, col in enumerate(DIGIT_COLS): df[col] = df["result"].str[i].astype(int) df["result_int"] = df["result"].astype(int) df["digit_sum"] = df[DIGIT_COLS].sum(axis=1) df["odd_count"] = df[DIGIT_COLS].apply(lambda r: sum(int(x % 2 == 1) for x in r), axis=1) df["unique_digit_count"] = df["result"].apply(lambda s: len(set(map(int, s)))) df["missing_calc"] = df["result"].apply(lambda s: sorted(list(ALL_DIGITS - set(map(int, s))))) df["missing_given_sorted"] = df["missing_calc"] df["missing_ok"] = True df["missing_count"] = df["missing_calc"].apply(len) df["date"] = df["datetime"].dt.date.astype(str) df["time"] = df["datetime"].dt.strftime("%H:%M") df["hour"] = df["datetime"].dt.hour df["minute"] = df["datetime"].dt.minute df["day"] = df["datetime"].dt.day df["month"] = df["datetime"].dt.month df["dayofweek"] = df["datetime"].dt.dayofweek df["draw_slot"] = df["time"].map({t: i for i, t in enumerate(DRAW_TIMES)}).fillna(-1).astype(int) for d in range(10): df[f"miss_{d}"] = df["missing_calc"].apply(lambda xs, d=d: int(d in xs)) return df def make_future_feature_row(history_df, 
future_dt): dummy = pd.DataFrame([{ "datetime": pd.to_datetime(future_dt), "result": "0000", "missing_given": list(range(1, 10)) }]) dummy = add_result_fields(dummy) temp = pd.concat([history_df, dummy], ignore_index=True) X, _ = build_features(temp) return X.tail(1) def combine_probs(rf_probs, formula_probs, rf_weight=0.60, formula_weight=0.40): return { pos_col: normalize_prob(rf_weight*np.array(rf_probs[pos_col]) + formula_weight*np.array(formula_probs[pos_col])) for pos_col in DIGIT_COLS } def candidates_from_position_probs(position_probs, top_k_each_pos=6, top_n=20): top_digits = [np.argsort(position_probs[pos_col])[::-1][:top_k_each_pos].tolist() for pos_col in DIGIT_COLS] rows = [] for combo in itertools.product(*top_digits): score = 1.0 for pos_col, digit in zip(DIGIT_COLS, combo): score *= float(position_probs[pos_col][digit]) rows.append({ "candidate": "".join(map(str, combo)), "score": score, "missing_digits": ",".join(map(str, sorted(list(ALL_DIGITS - set(combo))))) }) out = pd.DataFrame(rows).sort_values("score", ascending=False).head(top_n).reset_index(drop=True) out["rank"] = np.arange(1, len(out) + 1) out["score_pct"] = out["score"] / out["score"].sum() * 100 return out[["rank", "candidate", "score", "score_pct", "missing_digits"]] def predict_next(df, models, n_draws=6, top_n_numbers=20): history = df.copy() rows = [] for future_dt in next_draw_times(history["datetime"].max(), n=n_draws): X_row = make_future_feature_row(history, future_dt) rf_probs = {pos: align_model_proba(models[pos], X_row) for pos in DIGIT_COLS} formula_probs = formula_position_probs(history, future_dt) final_probs = combine_probs(rf_probs, formula_probs) cand = candidates_from_position_probs(final_probs, top_k_each_pos=6, top_n=top_n_numbers) cand.insert(0, "predicted_datetime", pd.to_datetime(future_dt)) rows.append(cand) # Untuk memprediksi beberapa jadwal ke depan, kandidat #1 dipakai sebagai history sementara. 
top_number = cand.iloc[0]["candidate"] add = pd.DataFrame([{ "datetime": pd.to_datetime(future_dt), "result": top_number, "missing_given": sorted(list(ALL_DIGITS - set(map(int, top_number)))) }]) add = add_result_fields(add) history = pd.concat([history, add[history.columns]], ignore_index=True) return pd.concat(rows, ignore_index=True) # ----------------------- # MAIN # ----------------------- def main(): file_path = resolve_file_path(FILE_PATH) text = Path(file_path).read_text(encoding="utf-8", errors="replace") df = parse_prize_txt(text, DATE_MODE) print(f"Jumlah baris terstruktur: {len(df)}") print(f"Rentang data: {df['datetime'].min()} sampai {df['datetime'].max()}") print(f"Kecocokan rumus missing digit: {df['missing_ok'].mean()*100:.2f}%") df.to_csv("prize2_clean.csv", index=False) X, y = build_features(df) features = pd.concat([df[["datetime", "result"]], X, y.add_prefix("target_")], axis=1) features.to_csv("prize2_features.csv", index=False) models, report = train_models(df) report.to_csv("prize2_model_report.csv", index=False) print("\nEvaluasi model pada 20% data terakhir:") print(report.to_string(index=False)) predictions = predict_next(df, models, NEXT_N_DRAWS, TOP_N_NUMBERS) predictions.to_csv("prize2_next_predictions.csv", index=False) print("\nTop 5 kandidat tiap jadwal berikutnya:") print(predictions.groupby("predicted_datetime").head(5).to_string(index=False)) print("\nOutput file:") print("- prize2_clean.csv") print("- prize2_features.csv") print("- prize2_model_report.csv") print("- prize2_next_predictions.csv") return df, features, report, predictions if __name__ == "__main__": df, features, report, predictions = main() ''' Path('/mnt/data/temp_script2.py').write_text(script2) len(script2.splitlines()), len(script2) (475, 17464)
python
import re

# Derive a lighter variant of the script: fewer trees in the final forest.
script3 = script2
script3 = script3.replace('n_estimators=200', 'n_estimators=80')

# Remove the rolling-frequency feature loop (the block that starts with
# `for win in [20, 100]`). Every regex metacharacter in the code being matched
# is escaped, and leading indentation is matched with ` *` so the pattern does
# not depend on the exact indent depth used in the script text.
rolling_block = (
    r'\n *# Rolling frekuensi digit sederhana, cepat, dan informatif\.'
    r'\n *for win in \[20, 100\]:'
    r'\n *for col in DIGIT_COLS:'
    r'\n *shifted = df\[col\]\.shift\(1\)'
    r'\n *for digit in range\(10\):'
    r'\n *feats\[f"freq\{win\}_\{col\}_\{digit\}"\] = '
    r'\(shifted == digit\)\.astype\(float\)\.rolling\(win, min_periods=2\)\.mean\(\)\n'
)
script3 = re.sub(rolling_block, '\n', script3)

Path('/mnt/data/temp_script3.py').write_text(script3)
len(script3.splitlines())
# Output recorded in the original session: 469
python
# Load the cleaned table and show the rows whose missing_ok flag is False,
# together with how many such rows exist.
df_clean = pd.read_csv('/mnt/data/prize2_clean.csv')
flagged = df_clean[~df_clean['missing_ok']]
flagged.head(), len(flagged)
# Output recorded in the original session:
# ( datetime result ... miss_8 miss_9 559 2026-05-16 06:30:00 5993 ... 1 0 [1 rows x 33 columns], 1)
Sudah saya ubah data TXT Anda menjadi paket siap Colab, termasuk data bersih, fitur model, laporan evaluasi, dan kandidat hasil berikutnya secara probabilistik. Sumbernya dari file PRIZE_2_ONLY.txt yang Anda upload. (PRIZE_2_ONLY.txt)
Perlu dicatat: data undian/angka seperti ini tidak punya rumus pasti. Script ini mencari pola historis, bukan menjamin hasil selanjutnya. Dari uji 20% data terakhir, akurasi top-1 tiap posisi hanya sekitar 7.5%–12.8%, yaitu setara dengan tebakan acak (10% untuk 10 kemungkinan digit), jadi hasilnya sebaiknya dianggap sebagai analisis probabilitas, bukan prediksi yang dapat diandalkan.
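Rumus statistik yang dipakai (di script, hasil rumus ini masih digabung dengan probabilitas Random Forest melalui combine_probs: 0.60 × P_RF + 0.40 × P_rumus):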
P_{final}(d)=0.40P_{global}(d)+0.25P_{recency}(d)+0.25P_{time}(d)+0.10P_{markov}(d)
Dan untuk kandidat 4D:
score = P_posisi1 × P_posisi2 × P_posisi3 × P_posisi4
File siap pakai:
Cara pakai di Google Colab: upload notebook .ipynb, upload file PRIZE_2_ONLY.txt, lalu jalankan semua cell. Saya juga buat opsi DATE_MODE; default-nya date_before_result, karena di file Anda baris tanggal terlihat berada sebelum angka berikutnya.