奥运奖牌预测分析

تم الإنشاء في: ٢٧ يناير ٢٠٢٥

تمت الإجابة باستخدام O1 بواسطة Chat01

سؤال

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

###############################################################

1) 第一段代码: 奥运奖牌预测 & 进步退步分析

###############################################################

读取 athletes & medal_counts

athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1')
medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1')

Merge & Preprocess

medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({
'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum'
})
athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'})
data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0)

Rolling historical features

data.sort_values(['NOC','Year'], inplace=True)
data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] =
data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0)

features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']
target_gold = 'Gold'
target_total = 'Total'

def build_model(df, feats, target):
X = df[feats]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2,
random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Model for {target}: MSE = {mse}")
return model

model_gold = build_model(data, features, target_gold)
model_total = build_model(data, features, target_total)

Predict for 2028

unique_nocs = data['NOC'].unique()
df_2024 = data[data['Year']==2024].copy()
group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean()

future_data = pd.DataFrame({'NOC': unique_nocs})
future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0)

pred_gold = model_gold.predict(future_data[features])
pred_total = model_total.predict(future_data[features])

pred_gold = np.clip(pred_gold, a_min=0, a_max=None)
pred_total = np.clip(pred_total, a_min=0, a_max=None)

future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int)
future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int)

gold_res = model_gold.predict(data[features]) - data[target_gold]
total_res = model_total.predict(data[features]) - data[target_total]
gold_std = gold_res.std()
total_std = total_res.std()

future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2gold_std).clip(lower=0).round().astype(int)
future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2
gold_std).clip(lower=0).round().astype(int)
future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2total_std).clip(lower=0).round().astype(int)
future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2
total_std).clip(lower=0).round().astype(int)

future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True)
print("Predicted Medal Table for 2028 Los Angeles Olympics:")
print(
future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper',
'Predicted_Total','Total_Lower','Total_Upper']]
)

future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False)
print("Results saved to 'Predicted_Medal_Table_2028.csv'")

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis')
plt.title('Predicted Total Medals in 2028 (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Predicted Total Medals')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

future_data['Gold_History'] = data.groupby('NOC')['Past_Gold_Avg'].mean().reindex(unique_nocs).fillna(0).values
future_data['Total_History'] = data.groupby('NOC')['Past_Total_Avg'].mean().reindex(unique_nocs).fillna(0).values

future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History']
future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History']
future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline')
future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline')

progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]
.sort_values('Gold_Progress', ascending=False)
decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]
.sort_values('Gold_Progress', ascending=True)
progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]
.sort_values('Total_Progress', ascending=False)
decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]
.sort_values('Total_Progress', ascending=True)

print("\nCountries Most Likely to Improve in Gold:")
print(progress_gold.head(10))
print("\nCountries Most Likely to Decline in Gold:")
print(decline_gold.head(10))
print("\nCountries Most Likely to Improve in Total Medals:")
print(progress_total.head(10))
print("\nCountries Most Likely to Decline in Total Medals:")
print(decline_total.head(10))

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False)
top10_gold = future_data_gold_sorted.head(10)
others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum()
labels = list(top10_gold['NOC'].values) + ['Others']
values = list(top10_gold['Predicted_Gold'].values) + [others_gold]

top_colors = sns.color_palette('Set2', n_colors=len(top10_gold))
others_color = [(0.5,0.5,0.5)]
colors = top_colors + others_color
explode = [0]*(len(top10_gold)) + [0.1]

plt.figure(figsize=(8,8))
plt.pie(
values,
labels=labels,
colors=colors,
explode=explode,
autopct='%1.1f%%',
startangle=160,
wedgeprops={'alpha':0.9},
labeldistance=1.12,
pctdistance=0.75
)
plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)")
plt.axis('equal')
plt.tight_layout()
plt.show()

###############################################################

2) 第二段代码: “断轴”示例 (保留原逻辑 + 输出)

###############################################################
countries = ["Yugoslavia","Turkey","Croatia","Canada","Costa Rica",
"Latvia","Indonesia","Nigeria","Kyrgyzstan","Finland"]
values = [0.67,0.65,0.52,0.50,0.49,0.48,0.48,0.47,0.47,0.46]
gold_prob_mod = pd.Series(data=values, index=countries)

x_labels = gold_prob_mod.index
x_vals = np.arange(len(x_labels))

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8,6))
fig.subplots_adjust(hspace=0.05)

ax1.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black')
ax1.set_ylim(0.45, 0.7)
ax1.spines['bottom'].set_visible(False)
ax1.tick_params(labelbottom=False)

ax2.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black')
ax2.set_ylim(0, 0.4)

d = .01
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((-d, +d), (-d, +d), **kwargs)
ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)

kwargs = dict(transform=ax2.transAxes, color='k', clip_on=False)
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)

ax2.set_xticks(x_vals)
ax2.set_xticklabels(x_labels, rotation=45, ha='right')
ax2.set_xlabel("Country (NOC)")
ax2.set_ylabel("Probability of 'trend>0'")
ax1.set_title("Top 10 Zero-Gold Countries (Broken Y-Axis)")

plt.tight_layout()
plt.show()

###############################################################

========== PART A: 生成 yearly_summary并画两张图 ==========

###############################################################
base_dir = r"D:\浏览器下载\pycharm\olympic\my_olympic_project.venv"
programs_file_path = os.path.join(base_dir, 'summerOly_programs.csv')
medal_counts_file_path = os.path.join(base_dir, 'summerOly_medal_counts.csv')
output_excel = os.path.join(base_dir, 'yearly_summary.xlsx')

programs_df = pd.read_csv(programs_file_path, encoding='ISO-8859-1')
medal_counts_df = pd.read_csv(medal_counts_file_path, encoding='ISO-8859-1')

programs_df_cleaned = programs_df.loc[:, ~programs_df.columns.str.contains("Code|Sport|Discipline|Sports Governing Body")]
programs_df_cleaned = programs_df_cleaned.rename(columns=lambda x: x.strip().replace('*','') if isinstance(x, str) else x)
programs_df_cleaned = programs_df_cleaned.apply(pd.to_numeric, errors='coerce')

programs_melted_cleaned = programs_df_cleaned.melt(var_name='Year', value_name='Number_of_Events')
programs_melted_cleaned['Year'] = pd.to_numeric(programs_melted_cleaned['Year'], errors='coerce', downcast='integer')

medal_counts_per_year = medal_counts_df.groupby(['Year','NOC'])['Gold'].sum().reset_index()
merged_data_cleaned = pd.merge(medal_counts_per_year, programs_melted_cleaned, how='left', on='Year')

yearly_summary = merged_data_cleaned.groupby('Year').agg({
'Number_of_Events':'sum',
'Gold':'sum'
}).reset_index()

yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold'] / yearly_summary['Number_of_Events']
yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold_to_Event_Ratio'].fillna(0)

yearly_summary.to_excel(output_excel, index=False)
print(f"yearly_summary 已保存至: {output_excel}")
print(yearly_summary.head(10))

plt.figure(figsize=(12, 6))
plt.plot(yearly_summary['Year'], yearly_summary['Gold_to_Event_Ratio'], marker='o', color='purple')
plt.title("Gold to Event Ratio per Year (1896-2024)", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Gold to Event Ratio", fontsize=12)
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(yearly_summary['Year'], yearly_summary['Number_of_Events'], label="Number of Events", marker='o', color='b')
plt.plot(yearly_summary['Year'], yearly_summary['Gold'], label="Gold Medals", marker='o', color='g')
plt.title("Relationship between Number of Events and Gold Medal Counts (1896-2024)", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.legend(title="Legend")
plt.tight_layout()
plt.show()

###############################################################

========== PART B: 生成“平滑”模拟数据并画折线图 ==========

###############################################################
N = 20
x_start = 10
x_vals = np.arange(x_start, x_start + N)
gold_list = []

rng = np.random.default_rng(2028)
for x in x_vals:
base = 2*x + 10
wave = math.sin(x/2)*3
y = round(base + wave)
gold_list.append(y)

df_line = pd.DataFrame({
'Number_of_Events': x_vals,
'Gold': gold_list
})

df_line.sort_values('Number_of_Events', inplace=True)
excel_name = "smooth_line_data_no_host.xlsx"
df_line.to_excel(excel_name, index=False)
print(f"Data has been saved to {excel_name}")
print(df_line.head(10))

plt.figure(figsize=(10,6))
plt.plot(
df_line['Number_of_Events'],
df_line['Gold'],
marker='o', color='blue', linewidth=2
)
plt.title("Smooth Relationship: Number_of_Events vs. Gold (No Host Column)", fontsize=14)
plt.xlabel("Number_of_Events (integer)", fontsize=12)
plt.ylabel("Gold (simulated)", fontsize=12)
plt.xticks(df_line['Number_of_Events'])
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

固定随机种子,方便复现

np.random.seed(42)

国家与项目列表

countries = ['China', 'USA', 'Romania', 'Japan', 'Russia', 'Germany', 'Brazil']
sports = ['Volleyball', 'Gymnastics', 'Basketball', 'Football', 'Tennis', 'Swimming', 'Track & Field']

构造示例数据

medals_data = []
for country in countries:
for sport in sports:
gold = np.random.randint(0, 10) # 金牌
silver = np.random.randint(0, 10) # 银牌
bronze = np.random.randint(0, 10) # 铜牌
total = gold + silver + bronze
medals_data.append([country, sport, gold, silver, bronze, total])

df = pd.DataFrame(medals_data, columns=['Country','Sport','Gold','Silver','Bronze','Total'])

教练对部分国家/项目的加成

coach_effect = {
'China': {'Volleyball': 3},
'USA': {'Gymnastics': 4},
'Romania': {'Gymnastics': 5}
}

奖牌数据加上教练加成

for country, effect_dict in coach_effect.items():
for sport, effect in effect_dict.items():
df.loc[(df['Country']==country)&(df['Sport']==sport), 'Gold'] += effect

print("Updated Medal Data with Coach Effects:")
print(df.head(15)) # 打印前15行查看

--------让图表更平滑的关键:对 Sport 做整数映射--------

sports_sorted = sorted(sports) # 对项目按字母排序
sport_to_idx = {s: i for i, s in enumerate(sports_sorted)}

在原 df 上添加 SportIndex 列

df['SportIndex'] = df['Sport'].map(sport_to_idx)

排序后方便下面同时导出 Excel

df.sort_values(['Country','SportIndex'], inplace=True)

导出到 Excel 便于查看/复用

excel_output = "chart_data_coach_effect.xlsx"
df.to_excel(excel_output, index=False)
print(f"\nData used for chart saved to {excel_output}\n")

----------绘图:x=SportIndex, y=Gold,按国家分别画-----------

plt.figure(figsize=(10, 6))
for country in countries:
# 过滤该国数据
cdata = df[df['Country']==country]
# 画折线
plt.plot(cdata['SportIndex'], cdata['Gold'], marker='o', linewidth=2, label=country)

plt.title("Gold Medals by Sport (with Coach Effect, Smoothed X-axis)", fontsize=14)
plt.xlabel("Sport (Mapped to Index)", fontsize=12)
plt.ylabel("Gold Medals", fontsize=12)

设置 x 轴刻度:整数->项目名称

plt.xticks(range(len(sports_sorted)), sports_sorted, rotation=45)
plt.legend(title="Country")
plt.tight_layout()
plt.show()请你整合这两段代码,保证他们的各自输出的结果不变

الفكرة

Integrating code

I'm thinking through combining two large code blocks into a single script, ensuring each block's results stay unchanged. Considering combining the Olympic medal prediction and progress analysis with the yearly summary and smoothed simulation data generation code.

Integrating tasks

Combining the Olympic medal prediction, progress analysis, broken axis example, PART A and PART B code, and the "coach effect" snippet into a streamlined script. Each task's unique output remains untouched.

Crafting a cohesive script

I’m working on a single script that seamlessly includes Olympic medal prediction, progress analysis, broken axis example, PART A, PART B, and the "coach effect" snippet, ensuring each function retains its unique output.

Orchestrating the script

I’m assembling a single script that seamlessly integrates Olympic medal prediction, progress analysis, broken axis example, and "coach effect" code, maintaining each segment's distinct output.

Integrating code sections

Combining Olympic medal prediction, progress analysis, broken axis example, PART A, PART B, and the "coach effect" into a single script, ensuring each segment's distinct output is preserved.

Crafting a cohesive script

I’m assembling a single script that seamlessly integrates Olympic medal prediction, progress analysis, broken axis example, PART A, PART B, and the "coach effect" code, ensuring unique outputs for each section.

Balancing imports

I’m considering placement of import statements, aiming for a clear single script that keeps each code block's outputs intact and avoids renaming variables for consistency.

Thought about code integration and analysis,持续 15 秒

الإجابة

下面给出一份将所有代码按原顺序整合在一起的示例脚本。这样做可以确保每个部分的逻辑和输出都与原本相同,互不干扰。你可以直接将此脚本保存为一个 .py 文件运行,或者在交互式环境(Jupyter/VSCode/Spyder等)中逐段执行。


python
import os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import math from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error ########################################################################## # 1) 第一段代码: 奥运奖牌预测 & 进步退步分析 ########################################################################## # 读取 athletes & medal_counts athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1') medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1') # Merge & Preprocess medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({ 'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum' }) athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'}) data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0) # Rolling historical features data.sort_values(['NOC','Year'], inplace=True) data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).mean()) data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).mean()) data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).sum()) data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).sum()) data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] = \ data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0) features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum'] target_gold = 'Gold' target_total = 'Total' def build_model(df, feats, target): X = df[feats] y = df[target] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) model = LinearRegression() model.fit(X_train, y_train) y_pred = model.predict(X_test) mse = mean_squared_error(y_test, y_pred) print(f"Model for {target}: MSE = {mse}") return model model_gold = build_model(data, features, target_gold) model_total = build_model(data, features, target_total) # Predict for 2028 unique_nocs = data['NOC'].unique() df_2024 = data[data['Year']==2024].copy() group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean() future_data = pd.DataFrame({'NOC': unique_nocs}) future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0) pred_gold = model_gold.predict(future_data[features]) pred_total = model_total.predict(future_data[features]) pred_gold = np.clip(pred_gold, a_min=0, a_max=None) pred_total = np.clip(pred_total, a_min=0, a_max=None) future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int) future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int) gold_res = model_gold.predict(data[features]) - data[target_gold] total_res = model_total.predict(data[features]) - data[target_total] gold_std = gold_res.std() total_std = total_res.std() future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2*gold_std).clip(lower=0).round().astype(int) future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2*gold_std).clip(lower=0).round().astype(int) future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2*total_std).clip(lower=0).round().astype(int) future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2*total_std).clip(lower=0).round().astype(int) future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True) print("Predicted Medal Table for 2028 Los Angeles Olympics:") print( future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper', 'Predicted_Total','Total_Lower','Total_Upper']] ) future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False) print("Results saved to 'Predicted_Medal_Table_2028.csv'") plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis') plt.title('Predicted Total Medals in 2028 (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Predicted Total Medals') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() future_data['Gold_History'] = data.groupby('NOC')['Past_Gold_Avg'].mean().reindex(unique_nocs).fillna(0).values future_data['Total_History'] = data.groupby('NOC')['Past_Total_Avg'].mean().reindex(unique_nocs).fillna(0).values future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History'] future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History'] future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline') future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline') progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\ .sort_values('Gold_Progress', ascending=False) decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\ .sort_values('Gold_Progress', ascending=True) progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\ .sort_values('Total_Progress', ascending=False) decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\ .sort_values('Total_Progress', ascending=True) print("\nCountries Most Likely to Improve in Gold:") print(progress_gold.head(10)) print("\nCountries Most Likely to Decline in Gold:") print(decline_gold.head(10)) print("\nCountries Most Likely to Improve in Total Medals:") print(progress_total.head(10)) print("\nCountries Most Likely to Decline in Total Medals:") print(decline_total.head(10)) plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues') plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Gold Medal Progress') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds') plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Gold Medal Decline') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues') plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Total Medal Progress') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds') plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Total Medal Decline') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False) top10_gold = future_data_gold_sorted.head(10) others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum() labels = list(top10_gold['NOC'].values) + ['Others'] values = list(top10_gold['Predicted_Gold'].values) + [others_gold] top_colors = sns.color_palette('Set2', n_colors=len(top10_gold)) others_color = [(0.5,0.5,0.5)] colors = top_colors + others_color explode = [0]*(len(top10_gold)) + [0.1] plt.figure(figsize=(8,8)) plt.pie( values, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=160, wedgeprops={'alpha':0.9}, labeldistance=1.12, pctdistance=0.75 ) plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)") plt.axis('equal') plt.tight_layout() plt.show() ########################################################################## # 2) 第二段代码: “断轴”示例 (保留原逻辑 + 输出) ########################################################################## countries = ["Yugoslavia","Turkey","Croatia","Canada","Costa Rica", "Latvia","Indonesia","Nigeria","Kyrgyzstan","Finland"] values = [0.67,0.65,0.52,0.50,0.49,0.48,0.48,0.47,0.47,0.46] gold_prob_mod = pd.Series(data=values, index=countries) x_labels = gold_prob_mod.index x_vals = np.arange(len(x_labels)) fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8,6)) fig.subplots_adjust(hspace=0.05) ax1.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black') ax1.set_ylim(0.45, 0.7) ax1.spines['bottom'].set_visible(False) ax1.tick_params(labelbottom=False) ax2.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black') ax2.set_ylim(0, 0.4) d = .01 kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False) ax1.plot((-d, +d), (-d, +d), **kwargs) ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs) kwargs = dict(transform=ax2.transAxes, color='k', clip_on=False) ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs) ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs) ax2.set_xticks(x_vals) ax2.set_xticklabels(x_labels, rotation=45, ha='right') ax2.set_xlabel("Country (NOC)") ax2.set_ylabel("Probability of 'trend>0'") ax1.set_title("Top 10 Zero-Gold Countries (Broken Y-Axis)") plt.tight_layout() plt.show() ########################################################################## # ========== PART A: 生成 yearly_summary 并画两张图 ========== ########################################################################## base_dir = r"D:\浏览器下载\pycharm\olympic\my_olympic_project\.venv" programs_file_path = os.path.join(base_dir, 'summerOly_programs.csv') medal_counts_file_path = os.path.join(base_dir, 'summerOly_medal_counts.csv') output_excel = os.path.join(base_dir, 'yearly_summary.xlsx') programs_df = pd.read_csv(programs_file_path, encoding='ISO-8859-1') medal_counts_df = pd.read_csv(medal_counts_file_path, encoding='ISO-8859-1') programs_df_cleaned = programs_df.loc[:, ~programs_df.columns.str.contains("Code|Sport|Discipline|Sports Governing Body")] programs_df_cleaned = programs_df_cleaned.rename(columns=lambda x: x.strip().replace('*','') if isinstance(x, str) else x) programs_df_cleaned = programs_df_cleaned.apply(pd.to_numeric, errors='coerce') programs_melted_cleaned = programs_df_cleaned.melt(var_name='Year', value_name='Number_of_Events') programs_melted_cleaned['Year'] = pd.to_numeric(programs_melted_cleaned['Year'], errors='coerce', downcast='integer') medal_counts_per_year = medal_counts_df.groupby(['Year','NOC'])['Gold'].sum().reset_index() merged_data_cleaned = pd.merge(medal_counts_per_year, programs_melted_cleaned, how='left', on='Year') yearly_summary = merged_data_cleaned.groupby('Year').agg({ 'Number_of_Events':'sum', 'Gold':'sum' }).reset_index() yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold'] / yearly_summary['Number_of_Events'] yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold_to_Event_Ratio'].fillna(0) yearly_summary.to_excel(output_excel, index=False) print(f"yearly_summary 已保存至: {output_excel}") print(yearly_summary.head(10)) plt.figure(figsize=(12, 6)) plt.plot(yearly_summary['Year'], yearly_summary['Gold_to_Event_Ratio'], marker='o', color='purple') plt.title("Gold to Event Ratio per Year (1896-2024)", fontsize=14) plt.xlabel("Year", fontsize=12) plt.ylabel("Gold to Event Ratio", fontsize=12) plt.tight_layout() plt.show() plt.figure(figsize=(12, 6)) plt.plot(yearly_summary['Year'], yearly_summary['Number_of_Events'], label="Number of Events", marker='o', color='b') plt.plot(yearly_summary['Year'], yearly_summary['Gold'], label="Gold Medals", marker='o', color='g') plt.title("Relationship between Number of Events and Gold Medal Counts (1896-2024)", fontsize=14) plt.xlabel("Year", fontsize=12) plt.ylabel("Count", fontsize=12) plt.legend(title="Legend") plt.tight_layout() plt.show() ########################################################################## # ========== PART B: 生成“平滑”模拟数据并画折线图 ========== ########################################################################## N = 20 x_start = 10 x_vals = np.arange(x_start, x_start + N) gold_list = [] rng = np.random.default_rng(2028) for x in x_vals: base = 2*x + 10 wave = math.sin(x/2)*3 y = round(base + wave) gold_list.append(y) df_line = pd.DataFrame({ 'Number_of_Events': x_vals, 'Gold': gold_list }) df_line.sort_values('Number_of_Events', inplace=True) excel_name = "smooth_line_data_no_host.xlsx" df_line.to_excel(excel_name, index=False) print(f"Data has been saved to {excel_name}") print(df_line.head(10)) plt.figure(figsize=(10,6)) plt.plot( df_line['Number_of_Events'], df_line['Gold'], marker='o', color='blue', linewidth=2 ) plt.title("Smooth Relationship: Number_of_Events vs. Gold (No Host Column)", fontsize=14) plt.xlabel("Number_of_Events (integer)", fontsize=12) plt.ylabel("Gold (simulated)", fontsize=12) plt.xticks(df_line['Number_of_Events']) plt.tight_layout() plt.show() ########################################################################## # ========== 示范:加入“教练加成”效果的折线图(示例数据) ========== ########################################################################## # 固定随机种子,方便复现 np.random.seed(42) # 国家与项目列表 countries = ['China', 'USA', 'Romania', 'Japan', 'Russia', 'Germany', 'Brazil'] sports = ['Volleyball', 'Gymnastics', 'Basketball', 'Football', 'Tennis', 'Swimming', 'Track & Field'] # 构造示例数据 medals_data = [] for country in countries: for sport in sports: gold = np.random.randint(0, 10) # 金牌 silver = np.random.randint(0, 10) # 银牌 bronze = np.random.randint(0, 10) # 铜牌 total = gold + silver + bronze medals_data.append([country, sport, gold, silver, bronze, total]) df = pd.DataFrame(medals_data, columns=['Country','Sport','Gold','Silver','Bronze','Total']) # 教练对部分国家/项目的加成 coach_effect = { 'China': {'Volleyball': 3}, 'USA': {'Gymnastics': 4}, 'Romania': {'Gymnastics': 5} } # 奖牌数据加上教练加成 for country, effect_dict in coach_effect.items(): for sport, effect in effect_dict.items(): df.loc[(df['Country']==country)&(df['Sport']==sport), 'Gold'] += effect print("Updated Medal Data with Coach Effects:") print(df.head(15)) # 打印前15行查看 # --------让图表更平滑的关键:对 Sport 做整数映射-------- sports_sorted = sorted(sports) # 对项目按字母排序 sport_to_idx = {s: i for i, s in enumerate(sports_sorted)} # 在原 df 上添加 SportIndex 列 df['SportIndex'] = df['Sport'].map(sport_to_idx) # 排序后方便下面同时导出 Excel df.sort_values(['Country','SportIndex'], inplace=True) # 导出到 Excel 便于查看/复用 excel_output = "chart_data_coach_effect.xlsx" df.to_excel(excel_output, index=False) print(f"\nData used for chart saved to {excel_output}\n") # ----------绘图:x=SportIndex, y=Gold,按国家分别画----------- plt.figure(figsize=(10, 6)) for country in countries: cdata = df[df['Country']==country] plt.plot(cdata['SportIndex'], cdata['Gold'], marker='o', linewidth=2, label=country) plt.title("Gold Medals by Sport (with Coach Effect, Smoothed X-axis)", fontsize=14) plt.xlabel("Sport (Mapped to Index)", fontsize=12) plt.ylabel("Gold Medals", fontsize=12) # 设置 x 轴刻度:整数->项目名称 plt.xticks(range(len(sports_sorted)), sports_sorted, rotation=45) plt.legend(title="Country") plt.tight_layout() plt.show()

说明

  1. 导入顺序
    所需的 import 都集中在脚本最开头,后续所有代码段都能复用相同的依赖环境。

  2. 各部分相对独立

    • 第一大段(奥运奖牌预测)和“断轴”示例用的变量、数据集均不冲突,所以可以无缝拼接。
    • PART APART B 以及“教练加成”的示例数据,也都在不同的命名空间或使用了新的变量,不会对之前的预测分析造成影响。
    • 因此可以按顺序放在同一脚本中,互不干扰,各自输出结果也与原本完全一致。
  3. 可按需精简或拆分
    如果你只需要其中某些功能,可以自由注释掉不想执行的部分;但为了保证每段输出与原逻辑一致,这里保持了全部细节。

将上述脚本保存并运行后,即可在同一脚本中获取所有原先独立代码块的输出结果。

سؤال

1

الفكرة

Thought about code integration,持续 几秒

الإجابة

下面是第一段代码(奥运奖牌预测与进步退步分析)的完整内容。如果你只想要此部分,可将其独立保存/运行。与原先给出的脚本逻辑和输出保持一致。


python
import os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import math from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error ############################################################### # 1) 第一段代码: 奥运奖牌预测 & 进步退步分析 ############################################################### # 读取 athletes & medal_counts athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1') medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1') # Merge & Preprocess medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({ 'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum' }) athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'}) data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0) # Rolling historical features data.sort_values(['NOC','Year'], inplace=True) data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).mean()) data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).mean()) data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).sum()) data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).sum()) data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] = \ data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0) features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum'] target_gold = 'Gold' target_total = 'Total' def build_model(df, feats, target): X = df[feats] y = df[target] X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) model = LinearRegression() model.fit(X_train, y_train) y_pred = model.predict(X_test) mse = mean_squared_error(y_test, y_pred) print(f"Model for {target}: MSE = {mse}") return model model_gold = build_model(data, features, target_gold) model_total = build_model(data, features, target_total) # Predict for 2028 unique_nocs = data['NOC'].unique() df_2024 = data[data['Year']==2024].copy() group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean() future_data = pd.DataFrame({'NOC': unique_nocs}) future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0) pred_gold = model_gold.predict(future_data[features]) pred_total = model_total.predict(future_data[features]) pred_gold = np.clip(pred_gold, a_min=0, a_max=None) pred_total = np.clip(pred_total, a_min=0, a_max=None) future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int) future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int) gold_res = model_gold.predict(data[features]) - data[target_gold] total_res = model_total.predict(data[features]) - data[target_total] gold_std = gold_res.std() total_std = total_res.std() future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2*gold_std).clip(lower=0).round().astype(int) future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2*gold_std).clip(lower=0).round().astype(int) future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2*total_std).clip(lower=0).round().astype(int) future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2*total_std).clip(lower=0).round().astype(int) future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True) print("Predicted Medal Table for 2028 Los Angeles Olympics:") print( future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper', 'Predicted_Total','Total_Lower','Total_Upper']] ) future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False) print("Results saved to 'Predicted_Medal_Table_2028.csv'") plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis') plt.title('Predicted Total Medals in 2028 (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Predicted Total Medals') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() # 进步/退步分析 future_data['Gold_History'] = ( data.groupby('NOC')['Past_Gold_Avg'].mean() .reindex(unique_nocs).fillna(0).values ) future_data['Total_History'] = ( data.groupby('NOC')['Past_Total_Avg'].mean() .reindex(unique_nocs).fillna(0).values ) future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History'] future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History'] future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline') future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline') progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\ .sort_values('Gold_Progress', ascending=False) decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\ .sort_values('Gold_Progress', ascending=True) progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\ .sort_values('Total_Progress', ascending=False) decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\ .sort_values('Total_Progress', ascending=True) print("\nCountries Most Likely to Improve in Gold:") print(progress_gold.head(10)) print("\nCountries Most Likely to Decline in Gold:") print(decline_gold.head(10)) print("\nCountries Most Likely to Improve in Total Medals:") print(progress_total.head(10)) print("\nCountries Most Likely to Decline in Total Medals:") print(decline_total.head(10)) # 画进步/退步图 plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues') plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Gold Medal Progress') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds') plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Gold Medal Decline') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues') plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Total Medal Progress') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds') plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Total Medal Decline') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() # 金牌预测饼图 future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False) top10_gold = future_data_gold_sorted.head(10) others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum() labels = list(top10_gold['NOC'].values) + ['Others'] values = list(top10_gold['Predicted_Gold'].values) + [others_gold] top_colors = sns.color_palette('Set2', n_colors=len(top10_gold)) others_color = [(0.5,0.5,0.5)] colors = top_colors + others_color explode = [0]*(len(top10_gold)) + [0.1] plt.figure(figsize=(8,8)) plt.pie( values, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=160, wedgeprops={'alpha':0.9}, labeldistance=1.12, pctdistance=0.75 ) plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)") plt.axis('equal') plt.tight_layout() plt.show()

要点说明

  1. 数据读取

    • 读取 summerOly_athletes.csvsummerOly_medal_counts.csv
    • 你需要提前将这些 CSV 文件放在运行脚本的同一目录下(或自行修改 pd.read_csv 的路径)。
  2. 建模与预测

    • 利用过去3届(窗口=3)的滚动均值、滚动总和作为特征。
    • 使用 LinearRegression 训练模型后,分别预测 2028 年金牌数量和总奖牌数,并给出区间范围(上下界)。
  3. 进步/退步分析

    • 以历史平均金牌(Past_Gold_Avg)或总奖牌(Past_Total_Avg)为基准,比较预测值和历史均值的差值,从而评估各国家可能的进步或退步幅度。
  4. 可视化

    • 多个柱状图与饼图,展示各国金牌预测、进步/退步榜单等。

如果后续还有任何疑问或需要更改的细节,欢迎再与我交流。

مشاركة هذا السؤال والجواب