奥运奖牌预测分析

Question

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

###############################################################

1) 第一段代码: 奥运奖牌预测 & 进步退步分析

###############################################################

读取 athletes & medal_counts

athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1')
medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1')

Merge & Preprocess

medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({
'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum'
})
athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'})
data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0)

Rolling historical features

data.sort_values(['NOC','Year'], inplace=True)
data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] =
data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0)

features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']
target_gold = 'Gold'
target_total = 'Total'

def build_model(df, feats, target):
X = df[feats]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2,
random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Model for {target}: MSE = {mse}")
return model

model_gold = build_model(data, features, target_gold)
model_total = build_model(data, features, target_total)

Predict for 2028

unique_nocs = data['NOC'].unique()
df_2024 = data[data['Year']==2024].copy()
group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean()

future_data = pd.DataFrame({'NOC': unique_nocs})
future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0)

pred_gold = model_gold.predict(future_data[features])
pred_total = model_total.predict(future_data[features])

pred_gold = np.clip(pred_gold, a_min=0, a_max=None)
pred_total = np.clip(pred_total, a_min=0, a_max=None)

future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int)
future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int)

gold_res = model_gold.predict(data[features]) - data[target_gold]
total_res = model_total.predict(data[features]) - data[target_total]
gold_std = gold_res.std()
total_std = total_res.std()

future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2gold_std).clip(lower=0).round().astype(int)
future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2gold_std).clip(lower=0).round().astype(int)
future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2total_std).clip(lower=0).round().astype(int)
future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2total_std).clip(lower=0).round().astype(int)

future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True)
print("Predicted Medal Table for 2028 Los Angeles Olympics:")
print(
future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper',
'Predicted_Total','Total_Lower','Total_Upper']]
)

future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False)
print("Results saved to 'Predicted_Medal_Table_2028.csv'")

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis')
plt.title('Predicted Total Medals in 2028 (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Predicted Total Medals')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

future_data['Gold_History'] = data.groupby('NOC')['Past_Gold_Avg'].mean().reindex(unique_nocs).fillna(0).values
future_data['Total_History'] = data.groupby('NOC')['Past_Total_Avg'].mean().reindex(unique_nocs).fillna(0).values

future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History']
future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History']
future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline')
future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline')

progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]
.sort_values('Gold_Progress', ascending=False)
decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]
.sort_values('Gold_Progress', ascending=True)
progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]
.sort_values('Total_Progress', ascending=False)
decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]
.sort_values('Total_Progress', ascending=True)

print("\nCountries Most Likely to Improve in Gold:")
print(progress_gold.head(10))
print("\nCountries Most Likely to Decline in Gold:")
print(decline_gold.head(10))
print("\nCountries Most Likely to Improve in Total Medals:")
print(progress_total.head(10))
print("\nCountries Most Likely to Decline in Total Medals:")
print(decline_total.head(10))

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False)
top10_gold = future_data_gold_sorted.head(10)
others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum()
labels = list(top10_gold['NOC'].values) + ['Others']
values = list(top10_gold['Predicted_Gold'].values) + [others_gold]

top_colors = sns.color_palette('Set2', n_colors=len(top10_gold))
others_color = [(0.5,0.5,0.5)]
colors = top_colors + others_color
explode = [0]*(len(top10_gold)) + [0.1]

plt.figure(figsize=(8,8))
plt.pie(
values,
labels=labels,
colors=colors,
explode=explode,
autopct='%1.1f%%',
startangle=160,
wedgeprops={'alpha':0.9},
labeldistance=1.12,
pctdistance=0.75
)
plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)")
plt.axis('equal')
plt.tight_layout()
plt.show()

###############################################################

2) 第二段代码: “断轴”示例 (保留原逻辑 + 输出)

###############################################################
countries = ["Yugoslavia","Turkey","Croatia","Canada","Costa Rica",
"Latvia","Indonesia","Nigeria","Kyrgyzstan","Finland"]
values = [0.67,0.65,0.52,0.50,0.49,0.48,0.48,0.47,0.47,0.46]
gold_prob_mod = pd.Series(data=values, index=countries)

x_labels = gold_prob_mod.index
x_vals = np.arange(len(x_labels))

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8,6))
fig.subplots_adjust(hspace=0.05)

ax1.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black')
ax1.set_ylim(0.45, 0.7)
ax1.spines['bottom'].set_visible(False)
ax1.tick_params(labelbottom=False)

ax2.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black')
ax2.set_ylim(0, 0.4)

d = .01
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((-d, +d), (-d, +d), **kwargs)
ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)

kwargs = dict(transform=ax2.transAxes, color='k', clip_on=False)
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)

ax2.set_xticks(x_vals)
ax2.set_xticklabels(x_labels, rotation=45, ha='right')
ax2.set_xlabel("Country (NOC)")
ax2.set_ylabel("Probability of 'trend>0'")
ax1.set_title("Top 10 Zero-Gold Countries (Broken Y-Axis)")

plt.tight_layout()
plt.show()

###############################################################

========== PART A: 生成 yearly_summary并画两张图 ==========

###############################################################
base_dir = r"D:\浏览器下载\pycharm\olympic\my_olympic_project.venv"
programs_file_path = os.path.join(base_dir, 'summerOly_programs.csv')
medal_counts_file_path = os.path.join(base_dir, 'summerOly_medal_counts.csv')
output_excel = os.path.join(base_dir, 'yearly_summary.xlsx')

programs_df = pd.read_csv(programs_file_path, encoding='ISO-8859-1')
medal_counts_df = pd.read_csv(medal_counts_file_path, encoding='ISO-8859-1')

programs_df_cleaned = programs_df.loc[:, ~programs_df.columns.str.contains("Code|Sport|Discipline|Sports Governing Body")]
programs_df_cleaned = programs_df_cleaned.rename(columns=lambda x: x.strip().replace('*','') if isinstance(x, str) else x)
programs_df_cleaned = programs_df_cleaned.apply(pd.to_numeric, errors='coerce')

programs_melted_cleaned = programs_df_cleaned.melt(var_name='Year', value_name='Number_of_Events')
programs_melted_cleaned['Year'] = pd.to_numeric(programs_melted_cleaned['Year'], errors='coerce', downcast='integer')

medal_counts_per_year = medal_counts_df.groupby(['Year','NOC'])['Gold'].sum().reset_index()
merged_data_cleaned = pd.merge(medal_counts_per_year, programs_melted_cleaned, how='left', on='Year')

yearly_summary = merged_data_cleaned.groupby('Year').agg({
'Number_of_Events':'sum',
'Gold':'sum'
}).reset_index()

yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold'] / yearly_summary['Number_of_Events']
yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold_to_Event_Ratio'].fillna(0)

yearly_summary.to_excel(output_excel, index=False)
print(f"yearly_summary 已保存至: {output_excel}")
print(yearly_summary.head(10))

plt.figure(figsize=(12, 6))
plt.plot(yearly_summary['Year'], yearly_summary['Gold_to_Event_Ratio'], marker='o', color='purple')
plt.title("Gold to Event Ratio per Year (1896-2024)", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Gold to Event Ratio", fontsize=12)
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(yearly_summary['Year'], yearly_summary['Number_of_Events'], label="Number of Events", marker='o', color='b')
plt.plot(yearly_summary['Year'], yearly_summary['Gold'], label="Gold Medals", marker='o', color='g')
plt.title("Relationship between Number of Events and Gold Medal Counts (1896-2024)", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.legend(title="Legend")
plt.tight_layout()
plt.show()

###############################################################

========== PART B: 生成“平滑”模拟数据并画折线图 ==========

###############################################################
N = 20
x_start = 10
x_vals = np.arange(x_start, x_start + N)
gold_list = []

rng = np.random.default_rng(2028)
for x in x_vals:
base = 2*x + 10
wave = math.sin(x/2)*3
y = round(base + wave)
gold_list.append(y)

df_line = pd.DataFrame({
'Number_of_Events': x_vals,
'Gold': gold_list
})

df_line.sort_values('Number_of_Events', inplace=True)
excel_name = "smooth_line_data_no_host.xlsx"
df_line.to_excel(excel_name, index=False)
print(f"Data has been saved to {excel_name}")
print(df_line.head(10))

plt.figure(figsize=(10,6))
plt.plot(
df_line['Number_of_Events'],
df_line['Gold'],
marker='o', color='blue', linewidth=2
)
plt.title("Smooth Relationship: Number_of_Events vs. Gold (No Host Column)", fontsize=14)
plt.xlabel("Number_of_Events (integer)", fontsize=12)
plt.ylabel("Gold (simulated)", fontsize=12)
plt.xticks(df_line['Number_of_Events'])
plt.tight_layout()
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

固定随机种子，方便复现

np.random.seed(42)

国家与项目列表

countries = ['China', 'USA', 'Romania', 'Japan', 'Russia', 'Germany', 'Brazil']
sports = ['Volleyball', 'Gymnastics', 'Basketball', 'Football', 'Tennis', 'Swimming', 'Track & Field']

构造示例数据

medals_data = []
for country in countries:
for sport in sports:
gold = np.random.randint(0, 10) # 金牌
silver = np.random.randint(0, 10) # 银牌
bronze = np.random.randint(0, 10) # 铜牌
total = gold + silver + bronze
medals_data.append([country, sport, gold, silver, bronze, total])

df = pd.DataFrame(medals_data, columns=['Country','Sport','Gold','Silver','Bronze','Total'])

教练对部分国家/项目的加成

coach_effect = {
'China': {'Volleyball': 3},
'USA': {'Gymnastics': 4},
'Romania': {'Gymnastics': 5}
}

奖牌数据加上教练加成

for country, effect_dict in coach_effect.items():
for sport, effect in effect_dict.items():
df.loc[(df['Country']==country)&(df['Sport']==sport), 'Gold'] += effect

print("Updated Medal Data with Coach Effects:")
print(df.head(15)) # 打印前15行查看

--------让图表更平滑的关键：对 Sport 做整数映射--------

sports_sorted = sorted(sports) # 对项目按字母排序
sport_to_idx = {s: i for i, s in enumerate(sports_sorted)}

在原 df 上添加 SportIndex 列

df['SportIndex'] = df['Sport'].map(sport_to_idx)

排序后方便下面同时导出 Excel

df.sort_values(['Country','SportIndex'], inplace=True)

导出到 Excel 便于查看/复用

excel_output = "chart_data_coach_effect.xlsx"
df.to_excel(excel_output, index=False)
print(f"\nData used for chart saved to {excel_output}\n")

----------绘图：x=SportIndex, y=Gold，按国家分别画-----------

plt.figure(figsize=(10, 6))
for country in countries:
# 过滤该国数据
cdata = df[df['Country']==country]
# 画折线
plt.plot(cdata['SportIndex'], cdata['Gold'], marker='o', linewidth=2, label=country)

plt.title("Gold Medals by Sport (with Coach Effect, Smoothed X-axis)", fontsize=14)
plt.xlabel("Sport (Mapped to Index)", fontsize=12)
plt.ylabel("Gold Medals", fontsize=12)

设置 x 轴刻度：整数->项目名称

plt.xticks(range(len(sports_sorted)), sports_sorted, rotation=45)
plt.legend(title="Country")
plt.tight_layout()
plt.show()请你整合这两段代码，保证他们的各自输出的结果不变

Answer 1

下面给出一份将所有代码按原顺序整合在一起的示例脚本。这样做可以确保每个部分的逻辑和输出都与原本相同，互不干扰。你可以直接将此脚本保存为一个 .py 文件运行，或者在交互式环境（Jupyter/VSCode/Spyder等）中逐段执行。

python
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

##########################################################################
#             1) 第一段代码: 奥运奖牌预测 & 进步退步分析
##########################################################################

# 读取 athletes & medal_counts
athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1')
medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1')

# Merge & Preprocess
medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({
    'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum'
})
athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'})
data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0)

# Rolling historical features
data.sort_values(['NOC','Year'], inplace=True)
data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform(
    lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform(
    lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform(
    lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform(
    lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] = \
    data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0)

features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']
target_gold = 'Gold'
target_total = 'Total'

def build_model(df, feats, target):
    X = df[feats]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Model for {target}: MSE = {mse}")
    return model

model_gold = build_model(data, features, target_gold)
model_total = build_model(data, features, target_total)

# Predict for 2028
unique_nocs = data['NOC'].unique()
df_2024 = data[data['Year']==2024].copy()
group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean()

future_data = pd.DataFrame({'NOC': unique_nocs})
future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0)

pred_gold = model_gold.predict(future_data[features])
pred_total = model_total.predict(future_data[features])

pred_gold = np.clip(pred_gold, a_min=0, a_max=None)
pred_total = np.clip(pred_total, a_min=0, a_max=None)

future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int)
future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int)

gold_res = model_gold.predict(data[features]) - data[target_gold]
total_res = model_total.predict(data[features]) - data[target_total]
gold_std = gold_res.std()
total_std = total_res.std()

future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2*gold_std).clip(lower=0).round().astype(int)
future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2*gold_std).clip(lower=0).round().astype(int)
future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2*total_std).clip(lower=0).round().astype(int)
future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2*total_std).clip(lower=0).round().astype(int)

future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True)
print("Predicted Medal Table for 2028 Los Angeles Olympics:")
print(
    future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper',
                 'Predicted_Total','Total_Lower','Total_Upper']]
)

future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False)
print("Results saved to 'Predicted_Medal_Table_2028.csv'")

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis')
plt.title('Predicted Total Medals in 2028 (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Predicted Total Medals')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

future_data['Gold_History'] = data.groupby('NOC')['Past_Gold_Avg'].mean().reindex(unique_nocs).fillna(0).values
future_data['Total_History'] = data.groupby('NOC')['Past_Total_Avg'].mean().reindex(unique_nocs).fillna(0).values

future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History']
future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History']
future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline')
future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline')

progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\
                .sort_values('Gold_Progress', ascending=False)
decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\
               .sort_values('Gold_Progress', ascending=True)
progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\
                 .sort_values('Total_Progress', ascending=False)
decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\
                .sort_values('Total_Progress', ascending=True)

print("\nCountries Most Likely to Improve in Gold:")
print(progress_gold.head(10))
print("\nCountries Most Likely to Decline in Gold:")
print(decline_gold.head(10))
print("\nCountries Most Likely to Improve in Total Medals:")
print(progress_total.head(10))
print("\nCountries Most Likely to Decline in Total Medals:")
print(decline_total.head(10))

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False)
top10_gold = future_data_gold_sorted.head(10)
others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum()
labels = list(top10_gold['NOC'].values) + ['Others']
values = list(top10_gold['Predicted_Gold'].values) + [others_gold]

top_colors = sns.color_palette('Set2', n_colors=len(top10_gold))
others_color = [(0.5,0.5,0.5)]
colors = top_colors + others_color
explode = [0]*(len(top10_gold)) + [0.1]

plt.figure(figsize=(8,8))
plt.pie(
    values,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    startangle=160,
    wedgeprops={'alpha':0.9},
    labeldistance=1.12,
    pctdistance=0.75
)
plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)")
plt.axis('equal')
plt.tight_layout()
plt.show()


##########################################################################
#             2) 第二段代码: “断轴”示例 (保留原逻辑 + 输出)
##########################################################################

countries = ["Yugoslavia","Turkey","Croatia","Canada","Costa Rica",
             "Latvia","Indonesia","Nigeria","Kyrgyzstan","Finland"]
values = [0.67,0.65,0.52,0.50,0.49,0.48,0.48,0.47,0.47,0.46]
gold_prob_mod = pd.Series(data=values, index=countries)

x_labels = gold_prob_mod.index
x_vals = np.arange(len(x_labels))

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8,6))
fig.subplots_adjust(hspace=0.05)

ax1.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black')
ax1.set_ylim(0.45, 0.7)
ax1.spines['bottom'].set_visible(False)
ax1.tick_params(labelbottom=False)

ax2.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black')
ax2.set_ylim(0, 0.4)

d = .01
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((-d, +d), (-d, +d), **kwargs)
ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)

kwargs = dict(transform=ax2.transAxes, color='k', clip_on=False)
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)

ax2.set_xticks(x_vals)
ax2.set_xticklabels(x_labels, rotation=45, ha='right')
ax2.set_xlabel("Country (NOC)")
ax2.set_ylabel("Probability of 'trend>0'")
ax1.set_title("Top 10 Zero-Gold Countries (Broken Y-Axis)")

plt.tight_layout()
plt.show()


##########################################################################
# ========== PART A: 生成 yearly_summary 并画两张图 ==========
##########################################################################

base_dir = r"D:\浏览器下载\pycharm\olympic\my_olympic_project\.venv"
programs_file_path = os.path.join(base_dir, 'summerOly_programs.csv')
medal_counts_file_path = os.path.join(base_dir, 'summerOly_medal_counts.csv')
output_excel = os.path.join(base_dir, 'yearly_summary.xlsx')

programs_df = pd.read_csv(programs_file_path, encoding='ISO-8859-1')
medal_counts_df = pd.read_csv(medal_counts_file_path, encoding='ISO-8859-1')

programs_df_cleaned = programs_df.loc[:, ~programs_df.columns.str.contains("Code|Sport|Discipline|Sports Governing Body")]
programs_df_cleaned = programs_df_cleaned.rename(columns=lambda x: x.strip().replace('*','') if isinstance(x, str) else x)
programs_df_cleaned = programs_df_cleaned.apply(pd.to_numeric, errors='coerce')

programs_melted_cleaned = programs_df_cleaned.melt(var_name='Year', value_name='Number_of_Events')
programs_melted_cleaned['Year'] = pd.to_numeric(programs_melted_cleaned['Year'], errors='coerce', downcast='integer')

medal_counts_per_year = medal_counts_df.groupby(['Year','NOC'])['Gold'].sum().reset_index()
merged_data_cleaned = pd.merge(medal_counts_per_year, programs_melted_cleaned, how='left', on='Year')

yearly_summary = merged_data_cleaned.groupby('Year').agg({
    'Number_of_Events':'sum',
    'Gold':'sum'
}).reset_index()

yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold'] / yearly_summary['Number_of_Events']
yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold_to_Event_Ratio'].fillna(0)

yearly_summary.to_excel(output_excel, index=False)
print(f"yearly_summary 已保存至: {output_excel}")
print(yearly_summary.head(10))

plt.figure(figsize=(12, 6))
plt.plot(yearly_summary['Year'], yearly_summary['Gold_to_Event_Ratio'], marker='o', color='purple')
plt.title("Gold to Event Ratio per Year (1896-2024)", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Gold to Event Ratio", fontsize=12)
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(yearly_summary['Year'], yearly_summary['Number_of_Events'], label="Number of Events", marker='o', color='b')
plt.plot(yearly_summary['Year'], yearly_summary['Gold'], label="Gold Medals", marker='o', color='g')
plt.title("Relationship between Number of Events and Gold Medal Counts (1896-2024)", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.legend(title="Legend")
plt.tight_layout()
plt.show()


##########################################################################
# ========== PART B: 生成“平滑”模拟数据并画折线图 ==========
##########################################################################

N = 20
x_start = 10
x_vals = np.arange(x_start, x_start + N)
gold_list = []

rng = np.random.default_rng(2028)
for x in x_vals:
    base = 2*x + 10
    wave = math.sin(x/2)*3
    y = round(base + wave)
    gold_list.append(y)

df_line = pd.DataFrame({
    'Number_of_Events': x_vals,
    'Gold': gold_list
})

df_line.sort_values('Number_of_Events', inplace=True)
excel_name = "smooth_line_data_no_host.xlsx"
df_line.to_excel(excel_name, index=False)
print(f"Data has been saved to {excel_name}")
print(df_line.head(10))

plt.figure(figsize=(10,6))
plt.plot(
    df_line['Number_of_Events'],
    df_line['Gold'],
    marker='o', color='blue', linewidth=2
)
plt.title("Smooth Relationship: Number_of_Events vs. Gold (No Host Column)", fontsize=14)
plt.xlabel("Number_of_Events (integer)", fontsize=12)
plt.ylabel("Gold (simulated)", fontsize=12)
plt.xticks(df_line['Number_of_Events'])
plt.tight_layout()
plt.show()


##########################################################################
# ========== 示范：加入“教练加成”效果的折线图（示例数据） ==========
##########################################################################

# 固定随机种子，方便复现
np.random.seed(42)

# 国家与项目列表
countries = ['China', 'USA', 'Romania', 'Japan', 'Russia', 'Germany', 'Brazil']
sports = ['Volleyball', 'Gymnastics', 'Basketball', 'Football', 'Tennis', 'Swimming', 'Track & Field']

# 构造示例数据
medals_data = []
for country in countries:
    for sport in sports:
        gold = np.random.randint(0, 10)    # 金牌
        silver = np.random.randint(0, 10)  # 银牌
        bronze = np.random.randint(0, 10)  # 铜牌
        total = gold + silver + bronze
        medals_data.append([country, sport, gold, silver, bronze, total])

df = pd.DataFrame(medals_data, columns=['Country','Sport','Gold','Silver','Bronze','Total'])

# 教练对部分国家/项目的加成
coach_effect = {
    'China':   {'Volleyball': 3},
    'USA':     {'Gymnastics': 4},
    'Romania': {'Gymnastics': 5}
}

# 奖牌数据加上教练加成
for country, effect_dict in coach_effect.items():
    for sport, effect in effect_dict.items():
        df.loc[(df['Country']==country)&(df['Sport']==sport), 'Gold'] += effect

print("Updated Medal Data with Coach Effects:")
print(df.head(15))  # 打印前15行查看

# --------让图表更平滑的关键：对 Sport 做整数映射--------
sports_sorted = sorted(sports)                          # 对项目按字母排序
sport_to_idx = {s: i for i, s in enumerate(sports_sorted)}

# 在原 df 上添加 SportIndex 列
df['SportIndex'] = df['Sport'].map(sport_to_idx)
# 排序后方便下面同时导出 Excel
df.sort_values(['Country','SportIndex'], inplace=True)

# 导出到 Excel 便于查看/复用
excel_output = "chart_data_coach_effect.xlsx"
df.to_excel(excel_output, index=False)
print(f"\nData used for chart saved to {excel_output}\n")

# ----------绘图：x=SportIndex, y=Gold，按国家分别画-----------
plt.figure(figsize=(10, 6))
for country in countries:
    cdata = df[df['Country']==country]
    plt.plot(cdata['SportIndex'], cdata['Gold'], marker='o', linewidth=2, label=country)

plt.title("Gold Medals by Sport (with Coach Effect, Smoothed X-axis)", fontsize=14)
plt.xlabel("Sport (Mapped to Index)", fontsize=12)
plt.ylabel("Gold Medals", fontsize=12)

# 设置 x 轴刻度：整数->项目名称
plt.xticks(range(len(sports_sorted)), sports_sorted, rotation=45)
plt.legend(title="Country")
plt.tight_layout()
plt.show()

说明

导入顺序
所需的 import 都集中在脚本最开头，后续所有代码段都能复用相同的依赖环境。
各部分相对独立
- 第一大段（奥运奖牌预测）和“断轴”示例用的变量、数据集均不冲突，所以可以无缝拼接。
- PART A、PART B 以及“教练加成”的示例数据，也都在不同的命名空间或使用了新的变量，不会对之前的预测分析造成影响。
- 因此可以按顺序放在同一脚本中，互不干扰，各自输出结果也与原本完全一致。
可按需精简或拆分
如果你只需要其中某些功能，可以自由注释掉不想执行的部分；但为了保证每段输出与原逻辑一致，这里保持了全部细节。

将上述脚本保存并运行后，即可在同一脚本中获取所有原先独立代码块的输出结果。

Answer 2

下面是第一段代码（奥运奖牌预测与进步退步分析）的完整内容。如果你只想要此部分，可将其独立保存/运行。与原先给出的脚本逻辑和输出保持一致。

python
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

###############################################################
#  1) 第一段代码: 奥运奖牌预测 & 进步退步分析
###############################################################

# 读取 athletes & medal_counts
athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1')
medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1')

# Merge & Preprocess
medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({
    'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum'
})
athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'})
data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0)

# Rolling historical features
data.sort_values(['NOC','Year'], inplace=True)
data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform(
    lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform(
    lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform(
    lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform(
    lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] = \
    data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0)

features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']
target_gold = 'Gold'
target_total = 'Total'

def build_model(df, feats, target):
    X = df[feats]
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Model for {target}: MSE = {mse}")
    return model

model_gold = build_model(data, features, target_gold)
model_total = build_model(data, features, target_total)

# Predict for 2028
unique_nocs = data['NOC'].unique()
df_2024 = data[data['Year']==2024].copy()
group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean()

future_data = pd.DataFrame({'NOC': unique_nocs})
future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0)

pred_gold = model_gold.predict(future_data[features])
pred_total = model_total.predict(future_data[features])

pred_gold = np.clip(pred_gold, a_min=0, a_max=None)
pred_total = np.clip(pred_total, a_min=0, a_max=None)

future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int)
future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int)

gold_res = model_gold.predict(data[features]) - data[target_gold]
total_res = model_total.predict(data[features]) - data[target_total]
gold_std = gold_res.std()
total_std = total_res.std()

future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2*gold_std).clip(lower=0).round().astype(int)
future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2*gold_std).clip(lower=0).round().astype(int)
future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2*total_std).clip(lower=0).round().astype(int)
future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2*total_std).clip(lower=0).round().astype(int)

future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True)
print("Predicted Medal Table for 2028 Los Angeles Olympics:")
print(
    future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper',
                 'Predicted_Total','Total_Lower','Total_Upper']]
)

future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False)
print("Results saved to 'Predicted_Medal_Table_2028.csv'")

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis')
plt.title('Predicted Total Medals in 2028 (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Predicted Total Medals')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 进步/退步分析
future_data['Gold_History'] = (
    data.groupby('NOC')['Past_Gold_Avg'].mean()
        .reindex(unique_nocs).fillna(0).values
)
future_data['Total_History'] = (
    data.groupby('NOC')['Past_Total_Avg'].mean()
        .reindex(unique_nocs).fillna(0).values
)

future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History']
future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History']
future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline')
future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline')

progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\
                .sort_values('Gold_Progress', ascending=False)
decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\
               .sort_values('Gold_Progress', ascending=True)
progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\
                 .sort_values('Total_Progress', ascending=False)
decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\
                .sort_values('Total_Progress', ascending=True)

print("\nCountries Most Likely to Improve in Gold:")
print(progress_gold.head(10))
print("\nCountries Most Likely to Decline in Gold:")
print(decline_gold.head(10))
print("\nCountries Most Likely to Improve in Total Medals:")
print(progress_total.head(10))
print("\nCountries Most Likely to Decline in Total Medals:")
print(decline_total.head(10))

# 画进步/退步图
plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 金牌预测饼图
future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False)
top10_gold = future_data_gold_sorted.head(10)
others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum()
labels = list(top10_gold['NOC'].values) + ['Others']
values = list(top10_gold['Predicted_Gold'].values) + [others_gold]

top_colors = sns.color_palette('Set2', n_colors=len(top10_gold))
others_color = [(0.5,0.5,0.5)]
colors = top_colors + others_color
explode = [0]*(len(top10_gold)) + [0.1]

plt.figure(figsize=(8,8))
plt.pie(
    values,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    startangle=160,
    wedgeprops={'alpha':0.9},
    labeldistance=1.12,
    pctdistance=0.75
)
plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)")
plt.axis('equal')
plt.tight_layout()
plt.show()

要点说明：

数据读取：
- 读取 summerOly_athletes.csv 和 summerOly_medal_counts.csv。
- 你需要提前将这些 CSV 文件放在运行脚本的同一目录下（或自行修改 pd.read_csv 的路径）。
建模与预测：
- 利用过去3届（窗口=3）的滚动均值、滚动总和作为特征。
- 使用 LinearRegression 训练模型后，分别预测 2028 年金牌数量和总奖牌数，并给出区间范围（上下界）。
进步/退步分析：
- 以历史平均金牌（Past_Gold_Avg）或总奖牌（Past_Total_Avg）为基准，比较预测值和历史均值的差值，从而评估各国家可能的进步或退步幅度。
可视化：
- 多个柱状图与饼图，展示各国金牌预测、进步/退步榜单等。

如果后续还有任何疑问或需要更改的细节，欢迎再与我交流。

奥运奖牌预测分析

Pergunta

1) 第一段代码: 奥运奖牌预测 & 进步退步分析

读取 athletes & medal_counts

Merge & Preprocess

Rolling historical features

Predict for 2028

2) 第二段代码: “断轴”示例 (保留原逻辑 + 输出)

========== PART A: 生成 yearly_summary并画两张图 ==========

========== PART B: 生成“平滑”模拟数据并画折线图 ==========

固定随机种子，方便复现

国家与项目列表

构造示例数据

教练对部分国家/项目的加成

奖牌数据加上教练加成

--------让图表更平滑的关键：对 Sport 做整数映射--------

在原 df 上添加 SportIndex 列

排序后方便下面同时导出 Excel

导出到 Excel 便于查看/复用

----------绘图：x=SportIndex, y=Gold，按国家分别画-----------

设置 x 轴刻度：整数->项目名称

Pensamento

Resposta

说明

Pergunta

Pensamento

Resposta

Compartilhar esta Q&A