import pandas as pd import numpy as np import ran...

Question

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from google.colab import files
from IPython.display import display, HTML
import copy

# === 1. Load data via Upload ===

# Ask for a CSV through the Colab upload widget. The result is used below as a
# mapping whose keys are the uploaded file names.
print("Silakan upload file CSV Anda (PRIZE_LEVEL_4_ELITE.csv):")
uploaded = files.upload()

# NOTE(review): the indentation of this if/else/try was lost when the script
# was pasted; the bodies below belong one level deeper than shown.
if not uploaded:
print("Error: Tidak ada file yang diupload.")
else:
# Only the first uploaded file is processed.
file_name = list(uploaded.keys())[0]
try:
# Read the CSV, asking pandas to parse the "timestamp" column as datetimes.
df = pd.read_csv(file_name, parse_dates=["timestamp"])
print(f"Berhasil memuat file: {file_name}")

text
    # === Column validation ===
    required_columns = ["timestamp", "raw_number", "sum_digits", "d1", "d2", "d3", "d4"]
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"File CSV kekurangan kolom wajib: {missing}")

    # raw_number is used as an index into length-10_000 arrays and d1..d4 as
    # indices into 10x10 transition matrices, so out-of-range (or missing)
    # values must be rejected here instead of crashing, or silently indexing
    # from the end, further down.
    digit_columns = ["d1", "d2", "d3", "d4"]
    if not df["raw_number"].between(0, 9999).all():
        raise ValueError("Kolom raw_number harus bernilai antara 0 dan 9999.")
    if not ((df[digit_columns] >= 0) & (df[digit_columns] <= 9)).to_numpy().all():
        raise ValueError("Kolom d1-d4 harus berisi digit 0-9.")

    # Every later step is positional ("the last N rows", "the previous draw in
    # the same timeslot", the 300-row hold-out), so enforce chronological order
    # and a clean 0..n-1 index before deriving anything from row positions.
    df = df.sort_values("timestamp", kind="stable").reset_index(drop=True)

    # === 2. Helper precomputations ===
    # Time of day ("HH:MM") of each draw, used to group draws into slots.
    df["timeslot"] = df["timestamp"].dt.strftime("%H:%M")
    numbers_array = df["raw_number"].values
    sum_array = df["sum_digits"].values
    # Number of odd digits (0..4) in each historical draw.
    odd_count_array = (df[digit_columns].values % 2).sum(axis=1)

    def number_to_digits(n: int):
        """Split ``n`` into its (thousands, hundreds, tens, ones) digits.

        The thousands component is ``n // 1000`` and is therefore only a single
        digit for ``n`` below 10,000.
        """
        remaining, ones = divmod(n, 10)
        remaining, tens = divmod(remaining, 10)
        thousands, hundreds = divmod(remaining, 10)
        return (thousands, hundreds, tens, ones)

    # Lookup tables over all 10,000 candidates: row i of digits_array holds the
    # four decimal digits of i (thousands first).
    candidate_numbers = np.arange(10_000)
    digits_array = np.column_stack(
        (
            candidate_numbers // 1000,
            candidate_numbers // 100 % 10,
            candidate_numbers // 10 % 10,
            candidate_numbers % 10,
        )
    ).astype(np.int8)
    # Digit sum of each candidate.
    sum_digits_array = digits_array.sum(axis=1)
    # Count of odd digits in each candidate.
    odd_even_array = (digits_array % 2).sum(axis=1)

    def pattern_category(d):
        """Classify a digit sequence by how its values repeat.

        Returns "AAAA" when only one distinct value occurs, "AAAB" when some
        value occurs exactly three times, "AABB" for two pairs, "AABC" when
        some value occurs exactly twice, and "ABCD" otherwise.
        """
        multiplicities = sorted(Counter(d).values())
        if len(multiplicities) == 1:
            return "AAAA"
        if 3 in multiplicities:
            return "AAAB"
        if multiplicities == [2, 2]:
            return "AABB"
        return "AABC" if 2 in multiplicities else "ABCD"

    # Integer code for each repetition pattern returned by pattern_category.
    pattern_map = {"AAAA": 0, "AAAB": 1, "AABB": 2, "AABC": 3, "ABCD": 4}

    # Pattern code of every candidate, in the row order of digits_array.
    pattern_id_array_candidates = np.fromiter(
        (pattern_map[pattern_category(tuple(candidate_digits))] for candidate_digits in digits_array),
        dtype=np.int8,
        count=len(digits_array),
    )

    # Pattern code of every historical draw, in the row order of df.
    drawn_digit_rows = df[["d1", "d2", "d3", "d4"]].itertuples(index=False, name=None)
    pattern_index_array = np.fromiter(
        (pattern_map[pattern_category(drawn)] for drawn in drawn_digit_rows),
        dtype=np.int8,
        count=len(df),
    )

    # prev_digits_array[i] holds the digits of the previous draw that shares
    # row i's timeslot; it stays all-zero when the slot has no earlier draw.
    prev_digits_array = np.zeros((len(df), 4), dtype=np.int8)
    # Per slot: one 10x10 count matrix per digit position, indexed
    # [previous digit, current digit]. Counts start at 1 so that no transition
    # ends up with zero probability after normalisation.
    markov_dict = defaultdict(lambda: [np.ones((10, 10), dtype=np.float32) for _ in range(4)])
    last_digits_per_slot = {}
    # The last 300 rows are kept out of the counts; evaluate_weights_fast
    # back-tests on those same rows.
    train_end = len(df) - 300

    for i, r in enumerate(df.itertuples()):
        slot = r.timeslot
        cur_digits = (r.d1, r.d2, r.d3, r.d4)
        prev_digits = last_digits_per_slot.get(slot)
        last_digits_per_slot[slot] = cur_digits
        # Track "has a previous draw" explicitly. Testing whether the stored
        # digits sum to zero would also discard genuine transitions from 0000.
        if prev_digits is None:
            continue
        prev_digits_array[i] = prev_digits
        if i < train_end:
            for pos in range(4):
                markov_dict[slot][pos][prev_digits[pos], cur_digits[pos]] += 1

    # Row-normalise the counts so each row is a distribution over the current digit.
    markov_probs_dict = {
        slot: [m / m.sum(axis=1, keepdims=True) for m in mats] for slot, mats in markov_dict.items()
    }

    # === 3. Scoring & evaluation function ===
    def evaluate_weights_fast(w, recent_n):
        """Back-test a weight vector on the most recent draws.

        For each of the last (up to) 300 rows of ``df``, all 10,000 candidate
        numbers are scored from the ``recent_n`` draws preceding that row and
        the rank of the number that was actually drawn is recorded.

        Args:
            w: Mapping with the keys "freq", "gap", "corr", "sum", "odd_even"
                and "pattern"; each value is the weight of that feature.
            recent_n: Number of preceding draws used as the feature window.

        Returns:
            ``(accuracy, median_rank)``: the share of evaluated draws whose
            rank is at most 10, and the median rank. ``(0.0, 10000.0)`` is
            returned when no draw could be evaluated.
        """
        n_rows = len(df)
        # Clamp at 0: with fewer than 300 rows a negative start would make the
        # loop index from the end of the arrays.
        eval_start = max(0, n_rows - 300)
        weight_keys = ["freq", "gap", "corr", "sum", "odd_even", "pattern"]
        w_vals = np.array([w[k] for k in weight_keys])
        # Positional lookup, so the result does not depend on df's index labels.
        timeslots = df["timeslot"].to_numpy()
        # Fallback for slots that have no trained transition matrices.
        uniform_mats = [np.ones((10, 10))] * 4
        hits, ranks = 0, []

        for idx in range(eval_start, n_rows):
            start_idx = max(0, idx - recent_n)
            window_numbers = numbers_array[start_idx:idx]
            window_len = len(window_numbers)
            if window_len == 0:
                continue

            # Relative frequency of each candidate inside the window.
            freq = np.bincount(window_numbers, minlength=10_000).astype(np.float32) / window_len

            # Recency: 1 / (draws since the candidate last appeared), 0 if unseen.
            gap_idx = np.full(10_000, -1, dtype=np.int32)
            for pos, num in enumerate(window_numbers):
                gap_idx[num] = pos  # later positions overwrite earlier ones
            gap = np.where(gap_idx >= 0, 1.0 / (window_len - gap_idx), 0.0)

            # Share of window draws with the candidate's digit sum.
            sum_counts = np.bincount(sum_array[start_idx:idx], minlength=37).astype(np.float32)
            sum_feat = sum_counts[sum_digits_array] / window_len

            # Share of window draws with the candidate's odd-digit count.
            odd_counts = np.bincount(odd_count_array[start_idx:idx], minlength=5).astype(np.float32)
            odd_feat = odd_counts[odd_even_array] / window_len

            # Share of window draws with the candidate's repetition pattern.
            pattern_counts = np.bincount(pattern_index_array[start_idx:idx], minlength=5).astype(np.float32)
            pattern_feat = pattern_counts[pattern_id_array_candidates] / window_len

            # Product over digit positions of P(candidate digit | previous
            # digit in the same timeslot).
            prev_d = prev_digits_array[idx]
            mats = markov_probs_dict.get(timeslots[idx], uniform_mats)
            corr = mats[0][prev_d[0], digits_array[:, 0]].copy()
            for p in range(1, 4):
                corr *= mats[p][prev_d[p], digits_array[:, p]]

            # Combine features using the weight vector.
            feat_matrix = np.stack([freq, gap, corr, sum_feat, odd_feat, pattern_feat], axis=1)
            score = feat_matrix @ w_vals

            actual = numbers_array[idx]
            actual_score = score[actual]
            better = int((score > actual_score).sum())
            tied = int((score == actual_score).sum())  # includes `actual` itself
            # Mid-rank for ties. Counting only strictly better candidates would
            # rank the drawn number first whenever thousands of candidates
            # share its score (e.g. all-zero or very coarse weights).
            rank = better + (tied + 1) / 2.0
            ranks.append(rank)
            if rank <= 10:
                hits += 1

        if not ranks:
            return 0.0, 10000.0
        # Divide by the number of draws actually evaluated, not a fixed 300.
        return hits / len(ranks), float(np.median(ranks))

    # === 4. GENETIC ALGORITHM IMPLEMENTATION ===
    class GeneticOptimizer:
        """Small real-coded genetic algorithm that tunes the six feature weights.

        An individual is a dict mapping each name in ``self.keys`` to a weight
        in [0, 1]. Individuals are compared by a metric where lower is better.
        """

        def __init__(self, fitness_func, recent_n, pop_size=25, generations=15, mutation_rate=0.2, seed=None):
            """Store the search configuration.

            Args:
                fitness_func: Callable ``(weights, recent_n)`` returning
                    ``(accuracy, median_rank)``.
                recent_n: Passed through unchanged to ``fitness_func``.
                pop_size: Individuals per generation (at least 1).
                generations: Number of generations to run.
                mutation_rate: Per-weight probability of a Gaussian perturbation.
                seed: Optional seed for a private random generator. When
                    ``None`` the module-level ``random`` state is used.

            Raises:
                ValueError: If ``pop_size`` is smaller than 1.
            """
            if pop_size < 1:
                raise ValueError("pop_size must be at least 1")
            self.fitness_func = fitness_func
            self.recent_n = recent_n
            self.pop_size = pop_size
            self.generations = generations
            self.mutation_rate = mutation_rate
            self.keys = ["freq", "gap", "corr", "sum", "odd_even", "pattern"]
            # The `random` module exposes the same random()/sample()/gauss()
            # functions as a Random instance, so either can serve as the source.
            self._rng = random.Random(seed) if seed is not None else random

        def _get_metric(self, w):
            """Return ``(metric, accuracy, median_rank)`` for weights ``w``."""
            acc, med = self.fitness_func(w, self.recent_n)
            return (1 - acc) + med / 10_000.0, acc, med

        def run(self):
            """Run the search.

            Returns:
                ``(best_weights, best_accuracy, best_median_rank, best_metric)``.
                With ``generations == 0`` nothing is evaluated and the result
                is ``(None, 0, 0, inf)``.
            """
            # Initialize population with uniform random weights in [0, 1).
            population = [
                {k: self._rng.random() for k in self.keys} for _ in range(self.pop_size)
            ]

            best_overall_w = None
            best_overall_metric = float('inf')
            best_overall_acc = 0
            best_overall_med = 0

            print(f"Starting Genetic Algorithm ({self.generations} generations)...")

            for gen in range(self.generations):
                # 1. Evaluate fitness
                scored_pop = []
                for ind in population:
                    metric, acc, med = self._get_metric(ind)
                    scored_pop.append({'w': ind, 'metric': metric, 'acc': acc, 'med': med})

                # Sort by metric (ascending, because lower is better)
                scored_pop.sort(key=lambda x: x['metric'])
                leader = scored_pop[0]

                # Track best across generations
                if leader['metric'] < best_overall_metric:
                    best_overall_metric = leader['metric']
                    # Individuals are flat dicts of floats; a shallow copy suffices.
                    best_overall_w = dict(leader['w'])
                    best_overall_acc = leader['acc']
                    best_overall_med = leader['med']

                print(f"Gen {gen+1}/{self.generations} | Best Metric: {best_overall_metric:.4f} | Best Med Rank: {best_overall_med:.1f}")

                # 2. Selection, with elitism: the current best survives unchanged.
                new_population = [dict(leader['w'])]

                while len(new_population) < self.pop_size:
                    p1 = self._tournament(scored_pop)
                    p2 = self._tournament(scored_pop)

                    # 3. Crossover (arithmetic), then 4. mutation
                    child = self._mutate(self._crossover(p1, p2))
                    new_population.append(child)

                population = new_population

            return best_overall_w, best_overall_acc, best_overall_med, best_overall_metric

        def _tournament(self, scored_pop):
            """Return the weights of the best of up to three random entries."""
            # Cap the sample size so populations smaller than 3 do not raise.
            participants = self._rng.sample(scored_pop, min(3, len(scored_pop)))
            return min(participants, key=lambda x: x['metric'])['w']

        def _crossover(self, p1, p2):
            """Return a new individual blending ``p1`` and ``p2`` with one random ratio."""
            alpha = self._rng.random()
            return {k: alpha * p1[k] + (1 - alpha) * p2[k] for k in self.keys}

        def _mutate(self, ind):
            """Perturb ``ind`` in place and return it.

            Each weight is changed with probability ``mutation_rate`` by adding
            Gaussian noise (sigma 0.1) and clamping the result to [0, 1].
            """
            for k in self.keys:
                if self._rng.random() < self.mutation_rate:
                    ind[k] = max(0.0, min(1.0, ind[k] + self._rng.gauss(0, 0.1)))
            return ind

    # === 5. Main Execution ===
    # Reference point: every feature weighted equally, 150-draw window.
    baseline_w = dict.fromkeys(("freq", "gap", "corr", "sum", "odd_even", "pattern"), 1.0)
    baseline_acc, baseline_med = evaluate_weights_fast(baseline_w, 150)

    # Run one GA search per candidate window size and keep each search's best.
    search_results = {}
    for recent in (50, 150, 300):
        optimizer = GeneticOptimizer(evaluate_weights_fast, recent, pop_size=20, generations=10)
        best_w_ga, best_acc_ga, best_med_ga, best_met_ga = optimizer.run()
        search_results[recent] = dict(weights=best_w_ga, acc=best_acc_ga, median=best_med_ga)

    def _window_cost(window_size):
        """Same lower-is-better metric the GA minimises, for a stored result."""
        outcome = search_results[window_size]
        return (1 - outcome["acc"]) + outcome["median"] / 10_000.0

    # Choose the window size whose best individual has the lowest cost.
    overall_best_recent = min(search_results, key=_window_cost)
    best_w = search_results[overall_best_recent]["weights"]

    # === 6. Final prediction ===
    # Clamp so that a window larger than the data set simply uses every row.
    start_idx = max(0, len(df) - overall_best_recent)
    window_numbers = numbers_array[start_idx:]
    window_len = len(window_numbers)
    if window_len == 0:
        raise ValueError("Tidak ada data undian untuk membuat prediksi.")

    # Normalise by the real window length, as evaluate_weights_fast does, so
    # the tuned weights see features on the scale they were tuned on.
    freq = np.bincount(window_numbers, minlength=10_000).astype(np.float32) / window_len
    gap_idx = np.full(10_000, -1, dtype=np.int32)
    for pos, num in enumerate(window_numbers):
        gap_idx[num] = pos  # later positions overwrite earlier ones
    gap = np.where(gap_idx >= 0, 1.0 / (window_len - gap_idx), 0.0)

    sum_counts = np.bincount(sum_array[start_idx:], minlength=37).astype(np.float32)
    sum_feat = sum_counts[sum_digits_array] / window_len
    odd_counts = np.bincount(odd_count_array[start_idx:], minlength=5).astype(np.float32)
    odd_feat = odd_counts[odd_even_array] / window_len
    pattern_counts = np.bincount(pattern_index_array[start_idx:], minlength=5).astype(np.float32)
    pattern_feat = pattern_counts[pattern_id_array_candidates] / window_len

    # NOTE(review): this conditions on the last row's digits and timeslot,
    # i.e. it presumes the next draw falls in the same slot as the most recent
    # one. Confirm against the draw schedule.
    last_row = df.iloc[-1]
    slot = last_row["timeslot"]
    prev_d_for_next = (last_row.d1, last_row.d2, last_row.d3, last_row.d4)
    mats = markov_probs_dict.get(slot, [np.ones((10,10))]*4)
    corr = mats[0][prev_d_for_next[0], digits_array[:, 0]].copy()
    for p in range(1, 4):
        corr *= mats[p][prev_d_for_next[p], digits_array[:, p]]

    final_score = (best_w["freq"] * freq + best_w["gap"] * gap + best_w["corr"] * corr +
                   best_w["sum"] * sum_feat + best_w["odd_even"] * odd_feat + best_w["pattern"] * pattern_feat)

    # A stable sort makes the order of equally scored candidates deterministic
    # (lower number first) instead of depending on the sort implementation.
    top10_idx = np.argsort(-final_score, kind="stable")[:10]
    top10_numbers = [f"{i:04d}" for i in top10_idx]
    top10_scores = final_score[top10_idx]

    # === 7. Results Display ===
    tuned_result = search_results[overall_best_recent]
    summary_df = pd.DataFrame({
        "Scenario": ["Baseline (equal w)", "Genetic Optimized"],
        "Top-10 Accuracy": [baseline_acc, tuned_result["acc"]],
        "Median Rank": [baseline_med, tuned_result["median"]],
    })

    # Horizontal rule printed above and below each section title.
    rule = "=" * 30

    print("\n" + rule)
    print("PERFORMANCE SUMMARY")
    print(rule)
    display(summary_df)

    # Bar chart comparing the median rank of the two scenarios.
    plt.figure(figsize=(6, 4))
    plt.bar(summary_df["Scenario"], summary_df["Median Rank"], color=['gray', 'green'])
    plt.title("Median Rank: Baseline vs Genetic GA")
    plt.ylabel("Median Rank (lower is better)")
    plt.show()

    print("\n" + rule)
    print("BEST HYPER-PARAMETERS (FOUND BY GA)")
    print(rule)
    print(f"Recent window (n): {overall_best_recent}")
    rounded_weights = {name: round(value, 4) for name, value in best_w.items()}
    print("Weights:", rounded_weights)

    print("\n" + rule)
    print("TOP-10 CANDIDATES FOR NEXT DRAW")
    print(rule)
    for candidate, candidate_score in zip(top10_numbers, top10_scores):
        print(f"{candidate}  |  score={candidate_score:.4f}")

# Top-level boundary for the try opened after the upload step: report the
# error message, then print the full traceback for debugging.
except Exception as e:
    print(f"Terjadi kesalahan: {e}")
    # Imported here, on the failure path only.
    import traceback
    traceback.print_exc()

Tingkatkan script ini dan buat ulang

Answer 1

I’ve refactored the entire notebook-style script into a clean, production-ready Python program:

Single-file CLI app – run it from the terminal (python lottery_prediction_enhanced.py mydata.csv) instead of relying on Colab’s upload widget.
Fully modular & type-annotated – easier to test, extend, or drop into a pipeline.
Configurable GA – tweak population, generations, mutation rate, or the time-window sizes straight from command-line flags.
Logging & reproducibility – progress is logged at your chosen verbosity level, and a --seed flag lets you get repeatable results.
Automatic artifact export – it writes a JSON summary, a CSV of the top-10 numbers, and a PNG chart into a results/ folder.
Major speed improvements – inner evaluation loops are now NumPy-vectorised.

Feel free to try it out and let me know what tweaks or extra features you’d like next!

import pandas as pd import numpy as np import ran...

Domanda

=== 1. Load data via Upload ===

Pensiero

Pensiero

Risposta

Condividi questa Q&A