奥运奖牌预测分析
Criado em: 27 de janeiro de 2025
Criado em: 27 de janeiro de 2025
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
###############################################################
###############################################################
athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1')
medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1')
medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({
'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum'
})
athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'})
data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0)
data.sort_values(['NOC','Year'], inplace=True)
data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).mean())
data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform(
lambda x: x.shift().rolling(window=3, min_periods=1).sum())
data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] =
data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0)
features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']
target_gold = 'Gold'
target_total = 'Total'
def build_model(df, feats, target):
X = df[feats]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2,
random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Model for {target}: MSE = {mse}")
return model
model_gold = build_model(data, features, target_gold)
model_total = build_model(data, features, target_total)
unique_nocs = data['NOC'].unique()
df_2024 = data[data['Year']==2024].copy()
group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean()
future_data = pd.DataFrame({'NOC': unique_nocs})
future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0)
pred_gold = model_gold.predict(future_data[features])
pred_total = model_total.predict(future_data[features])
pred_gold = np.clip(pred_gold, a_min=0, a_max=None)
pred_total = np.clip(pred_total, a_min=0, a_max=None)
future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int)
future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int)
gold_res = model_gold.predict(data[features]) - data[target_gold]
total_res = model_total.predict(data[features]) - data[target_total]
gold_std = gold_res.std()
total_std = total_res.std()
future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2gold_std).clip(lower=0).round().astype(int)
future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2gold_std).clip(lower=0).round().astype(int)
future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2total_std).clip(lower=0).round().astype(int)
future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2total_std).clip(lower=0).round().astype(int)
future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True)
print("Predicted Medal Table for 2028 Los Angeles Olympics:")
print(
future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper',
'Predicted_Total','Total_Lower','Total_Upper']]
)
future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False)
print("Results saved to 'Predicted_Medal_Table_2028.csv'")
plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis')
plt.title('Predicted Total Medals in 2028 (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Predicted Total Medals')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
future_data['Gold_History'] = data.groupby('NOC')['Past_Gold_Avg'].mean().reindex(unique_nocs).fillna(0).values
future_data['Total_History'] = data.groupby('NOC')['Past_Total_Avg'].mean().reindex(unique_nocs).fillna(0).values
future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History']
future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History']
future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline')
future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline')
progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]
.sort_values('Gold_Progress', ascending=False)
decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]
.sort_values('Gold_Progress', ascending=True)
progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]
.sort_values('Total_Progress', ascending=False)
decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]
.sort_values('Total_Progress', ascending=True)
print("\nCountries Most Likely to Improve in Gold:")
print(progress_gold.head(10))
print("\nCountries Most Likely to Decline in Gold:")
print(decline_gold.head(10))
print("\nCountries Most Likely to Improve in Total Medals:")
print(progress_total.head(10))
print("\nCountries Most Likely to Decline in Total Medals:")
print(decline_total.head(10))
plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Gold Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues')
plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Progress')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
plt.figure(figsize=(12, 8))
sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds')
plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)')
plt.xlabel('Country (NOC)')
plt.ylabel('Total Medal Decline')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False)
top10_gold = future_data_gold_sorted.head(10)
others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum()
labels = list(top10_gold['NOC'].values) + ['Others']
values = list(top10_gold['Predicted_Gold'].values) + [others_gold]
top_colors = sns.color_palette('Set2', n_colors=len(top10_gold))
others_color = [(0.5,0.5,0.5)]
colors = top_colors + others_color
explode = [0]*(len(top10_gold)) + [0.1]
plt.figure(figsize=(8,8))
plt.pie(
values,
labels=labels,
colors=colors,
explode=explode,
autopct='%1.1f%%',
startangle=160,
wedgeprops={'alpha':0.9},
labeldistance=1.12,
pctdistance=0.75
)
plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)")
plt.axis('equal')
plt.tight_layout()
plt.show()
###############################################################
###############################################################
countries = ["Yugoslavia","Turkey","Croatia","Canada","Costa Rica",
"Latvia","Indonesia","Nigeria","Kyrgyzstan","Finland"]
values = [0.67,0.65,0.52,0.50,0.49,0.48,0.48,0.47,0.47,0.46]
gold_prob_mod = pd.Series(data=values, index=countries)
x_labels = gold_prob_mod.index
x_vals = np.arange(len(x_labels))
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8,6))
fig.subplots_adjust(hspace=0.05)
ax1.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black')
ax1.set_ylim(0.45, 0.7)
ax1.spines['bottom'].set_visible(False)
ax1.tick_params(labelbottom=False)
ax2.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black')
ax2.set_ylim(0, 0.4)
d = .01
kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False)
ax1.plot((-d, +d), (-d, +d), **kwargs)
ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs)
kwargs = dict(transform=ax2.transAxes, color='k', clip_on=False)
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)
ax2.set_xticks(x_vals)
ax2.set_xticklabels(x_labels, rotation=45, ha='right')
ax2.set_xlabel("Country (NOC)")
ax2.set_ylabel("Probability of 'trend>0'")
ax1.set_title("Top 10 Zero-Gold Countries (Broken Y-Axis)")
plt.tight_layout()
plt.show()
###############################################################
###############################################################
base_dir = r"D:\浏览器下载\pycharm\olympic\my_olympic_project.venv"
programs_file_path = os.path.join(base_dir, 'summerOly_programs.csv')
medal_counts_file_path = os.path.join(base_dir, 'summerOly_medal_counts.csv')
output_excel = os.path.join(base_dir, 'yearly_summary.xlsx')
programs_df = pd.read_csv(programs_file_path, encoding='ISO-8859-1')
medal_counts_df = pd.read_csv(medal_counts_file_path, encoding='ISO-8859-1')
programs_df_cleaned = programs_df.loc[:, ~programs_df.columns.str.contains("Code|Sport|Discipline|Sports Governing Body")]
programs_df_cleaned = programs_df_cleaned.rename(columns=lambda x: x.strip().replace('*','') if isinstance(x, str) else x)
programs_df_cleaned = programs_df_cleaned.apply(pd.to_numeric, errors='coerce')
programs_melted_cleaned = programs_df_cleaned.melt(var_name='Year', value_name='Number_of_Events')
programs_melted_cleaned['Year'] = pd.to_numeric(programs_melted_cleaned['Year'], errors='coerce', downcast='integer')
medal_counts_per_year = medal_counts_df.groupby(['Year','NOC'])['Gold'].sum().reset_index()
merged_data_cleaned = pd.merge(medal_counts_per_year, programs_melted_cleaned, how='left', on='Year')
yearly_summary = merged_data_cleaned.groupby('Year').agg({
'Number_of_Events':'sum',
'Gold':'sum'
}).reset_index()
yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold'] / yearly_summary['Number_of_Events']
yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold_to_Event_Ratio'].fillna(0)
yearly_summary.to_excel(output_excel, index=False)
print(f"yearly_summary 已保存至: {output_excel}")
print(yearly_summary.head(10))
plt.figure(figsize=(12, 6))
plt.plot(yearly_summary['Year'], yearly_summary['Gold_to_Event_Ratio'], marker='o', color='purple')
plt.title("Gold to Event Ratio per Year (1896-2024)", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Gold to Event Ratio", fontsize=12)
plt.tight_layout()
plt.show()
plt.figure(figsize=(12, 6))
plt.plot(yearly_summary['Year'], yearly_summary['Number_of_Events'], label="Number of Events", marker='o', color='b')
plt.plot(yearly_summary['Year'], yearly_summary['Gold'], label="Gold Medals", marker='o', color='g')
plt.title("Relationship between Number of Events and Gold Medal Counts (1896-2024)", fontsize=14)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.legend(title="Legend")
plt.tight_layout()
plt.show()
###############################################################
###############################################################
N = 20
x_start = 10
x_vals = np.arange(x_start, x_start + N)
gold_list = []
rng = np.random.default_rng(2028)
for x in x_vals:
base = 2*x + 10
wave = math.sin(x/2)*3
y = round(base + wave)
gold_list.append(y)
df_line = pd.DataFrame({
'Number_of_Events': x_vals,
'Gold': gold_list
})
df_line.sort_values('Number_of_Events', inplace=True)
excel_name = "smooth_line_data_no_host.xlsx"
df_line.to_excel(excel_name, index=False)
print(f"Data has been saved to {excel_name}")
print(df_line.head(10))
plt.figure(figsize=(10,6))
plt.plot(
df_line['Number_of_Events'],
df_line['Gold'],
marker='o', color='blue', linewidth=2
)
plt.title("Smooth Relationship: Number_of_Events vs. Gold (No Host Column)", fontsize=14)
plt.xlabel("Number_of_Events (integer)", fontsize=12)
plt.ylabel("Gold (simulated)", fontsize=12)
plt.xticks(df_line['Number_of_Events'])
plt.tight_layout()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
np.random.seed(42)
countries = ['China', 'USA', 'Romania', 'Japan', 'Russia', 'Germany', 'Brazil']
sports = ['Volleyball', 'Gymnastics', 'Basketball', 'Football', 'Tennis', 'Swimming', 'Track & Field']
medals_data = []
for country in countries:
for sport in sports:
gold = np.random.randint(0, 10) # 金牌
silver = np.random.randint(0, 10) # 银牌
bronze = np.random.randint(0, 10) # 铜牌
total = gold + silver + bronze
medals_data.append([country, sport, gold, silver, bronze, total])
df = pd.DataFrame(medals_data, columns=['Country','Sport','Gold','Silver','Bronze','Total'])
coach_effect = {
'China': {'Volleyball': 3},
'USA': {'Gymnastics': 4},
'Romania': {'Gymnastics': 5}
}
for country, effect_dict in coach_effect.items():
for sport, effect in effect_dict.items():
df.loc[(df['Country']==country)&(df['Sport']==sport), 'Gold'] += effect
print("Updated Medal Data with Coach Effects:")
print(df.head(15)) # 打印前15行查看
sports_sorted = sorted(sports) # 对项目按字母排序
sport_to_idx = {s: i for i, s in enumerate(sports_sorted)}
df['SportIndex'] = df['Sport'].map(sport_to_idx)
df.sort_values(['Country','SportIndex'], inplace=True)
excel_output = "chart_data_coach_effect.xlsx"
df.to_excel(excel_output, index=False)
print(f"\nData used for chart saved to {excel_output}\n")
plt.figure(figsize=(10, 6))
for country in countries:
# 过滤该国数据
cdata = df[df['Country']==country]
# 画折线
plt.plot(cdata['SportIndex'], cdata['Gold'], marker='o', linewidth=2, label=country)
plt.title("Gold Medals by Sport (with Coach Effect, Smoothed X-axis)", fontsize=14)
plt.xlabel("Sport (Mapped to Index)", fontsize=12)
plt.ylabel("Gold Medals", fontsize=12)
plt.xticks(range(len(sports_sorted)), sports_sorted, rotation=45)
plt.legend(title="Country")
plt.tight_layout()
plt.show()请你整合这两段代码,保证他们的各自输出的结果不变
下面给出一份将所有代码按原顺序整合在一起的示例脚本。这样做可以确保每个部分的逻辑和输出都与原本相同,互不干扰。你可以直接将此脚本保存为一个 .py
文件运行,或者在交互式环境(Jupyter/VSCode/Spyder等)中逐段执行。
pythonimport os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import math from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error ########################################################################## # 1) 第一段代码: 奥运奖牌预测 & 进步退步分析 ########################################################################## # 读取 athletes & medal_counts athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1') medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1') # Merge & Preprocess medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({ 'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum' }) athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'}) data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0) # Rolling historical features data.sort_values(['NOC','Year'], inplace=True) data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).mean()) data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).mean()) data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).sum()) data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).sum()) data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] = \ data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0) features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum'] target_gold = 'Gold' target_total = 'Total' def build_model(df, feats, target): X = df[feats] y = df[target] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) model = LinearRegression() model.fit(X_train, y_train) y_pred = model.predict(X_test) mse = mean_squared_error(y_test, y_pred) print(f"Model for {target}: MSE = {mse}") return model model_gold = build_model(data, features, target_gold) model_total = build_model(data, features, target_total) # Predict for 2028 unique_nocs = data['NOC'].unique() df_2024 = data[data['Year']==2024].copy() group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean() future_data = pd.DataFrame({'NOC': unique_nocs}) future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0) pred_gold = model_gold.predict(future_data[features]) pred_total = model_total.predict(future_data[features]) pred_gold = np.clip(pred_gold, a_min=0, a_max=None) pred_total = np.clip(pred_total, a_min=0, a_max=None) future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int) future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int) gold_res = model_gold.predict(data[features]) - data[target_gold] total_res = model_total.predict(data[features]) - data[target_total] gold_std = gold_res.std() total_std = total_res.std() future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2*gold_std).clip(lower=0).round().astype(int) future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2*gold_std).clip(lower=0).round().astype(int) future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2*total_std).clip(lower=0).round().astype(int) future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2*total_std).clip(lower=0).round().astype(int) future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True) print("Predicted Medal Table for 2028 Los Angeles Olympics:") print( future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper', 'Predicted_Total','Total_Lower','Total_Upper']] ) future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False) print("Results saved to 'Predicted_Medal_Table_2028.csv'") plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis') plt.title('Predicted Total Medals in 2028 (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Predicted Total Medals') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() future_data['Gold_History'] = data.groupby('NOC')['Past_Gold_Avg'].mean().reindex(unique_nocs).fillna(0).values future_data['Total_History'] = data.groupby('NOC')['Past_Total_Avg'].mean().reindex(unique_nocs).fillna(0).values future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History'] future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History'] future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline') future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline') progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\ .sort_values('Gold_Progress', ascending=False) decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\ .sort_values('Gold_Progress', ascending=True) progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\ .sort_values('Total_Progress', ascending=False) decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\ .sort_values('Total_Progress', ascending=True) print("\nCountries Most Likely to Improve in Gold:") print(progress_gold.head(10)) print("\nCountries Most Likely to Decline in Gold:") print(decline_gold.head(10)) print("\nCountries Most Likely to Improve in Total Medals:") print(progress_total.head(10)) print("\nCountries Most Likely to Decline in Total Medals:") print(decline_total.head(10)) plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues') plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Gold Medal Progress') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds') plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Gold Medal Decline') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues') plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Total Medal Progress') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds') plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Total Medal Decline') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False) top10_gold = future_data_gold_sorted.head(10) others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum() labels = list(top10_gold['NOC'].values) + ['Others'] values = list(top10_gold['Predicted_Gold'].values) + [others_gold] top_colors = sns.color_palette('Set2', n_colors=len(top10_gold)) others_color = [(0.5,0.5,0.5)] colors = top_colors + others_color explode = [0]*(len(top10_gold)) + [0.1] plt.figure(figsize=(8,8)) plt.pie( values, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=160, wedgeprops={'alpha':0.9}, labeldistance=1.12, pctdistance=0.75 ) plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)") plt.axis('equal') plt.tight_layout() plt.show() ########################################################################## # 2) 第二段代码: “断轴”示例 (保留原逻辑 + 输出) ########################################################################## countries = ["Yugoslavia","Turkey","Croatia","Canada","Costa Rica", "Latvia","Indonesia","Nigeria","Kyrgyzstan","Finland"] values = [0.67,0.65,0.52,0.50,0.49,0.48,0.48,0.47,0.47,0.46] gold_prob_mod = pd.Series(data=values, index=countries) x_labels = gold_prob_mod.index x_vals = np.arange(len(x_labels)) fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8,6)) fig.subplots_adjust(hspace=0.05) ax1.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black') ax1.set_ylim(0.45, 0.7) ax1.spines['bottom'].set_visible(False) ax1.tick_params(labelbottom=False) ax2.bar(x_vals, gold_prob_mod, color='skyblue', edgecolor='black') ax2.set_ylim(0, 0.4) d = .01 kwargs = dict(transform=ax1.transAxes, color='k', clip_on=False) ax1.plot((-d, +d), (-d, +d), **kwargs) ax1.plot((1 - d, 1 + d), (-d, +d), **kwargs) kwargs = dict(transform=ax2.transAxes, color='k', clip_on=False) ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs) ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs) ax2.set_xticks(x_vals) ax2.set_xticklabels(x_labels, rotation=45, ha='right') ax2.set_xlabel("Country (NOC)") ax2.set_ylabel("Probability of 'trend>0'") ax1.set_title("Top 10 Zero-Gold Countries (Broken Y-Axis)") plt.tight_layout() plt.show() ########################################################################## # ========== PART A: 生成 yearly_summary 并画两张图 ========== ########################################################################## base_dir = r"D:\浏览器下载\pycharm\olympic\my_olympic_project\.venv" programs_file_path = os.path.join(base_dir, 'summerOly_programs.csv') medal_counts_file_path = os.path.join(base_dir, 'summerOly_medal_counts.csv') output_excel = os.path.join(base_dir, 'yearly_summary.xlsx') programs_df = pd.read_csv(programs_file_path, encoding='ISO-8859-1') medal_counts_df = pd.read_csv(medal_counts_file_path, encoding='ISO-8859-1') programs_df_cleaned = programs_df.loc[:, ~programs_df.columns.str.contains("Code|Sport|Discipline|Sports Governing Body")] programs_df_cleaned = programs_df_cleaned.rename(columns=lambda x: x.strip().replace('*','') if isinstance(x, str) else x) programs_df_cleaned = programs_df_cleaned.apply(pd.to_numeric, errors='coerce') programs_melted_cleaned = programs_df_cleaned.melt(var_name='Year', value_name='Number_of_Events') programs_melted_cleaned['Year'] = pd.to_numeric(programs_melted_cleaned['Year'], errors='coerce', downcast='integer') medal_counts_per_year = medal_counts_df.groupby(['Year','NOC'])['Gold'].sum().reset_index() merged_data_cleaned = pd.merge(medal_counts_per_year, programs_melted_cleaned, how='left', on='Year') yearly_summary = merged_data_cleaned.groupby('Year').agg({ 'Number_of_Events':'sum', 'Gold':'sum' }).reset_index() yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold'] / yearly_summary['Number_of_Events'] yearly_summary['Gold_to_Event_Ratio'] = yearly_summary['Gold_to_Event_Ratio'].fillna(0) yearly_summary.to_excel(output_excel, index=False) print(f"yearly_summary 已保存至: {output_excel}") print(yearly_summary.head(10)) plt.figure(figsize=(12, 6)) plt.plot(yearly_summary['Year'], yearly_summary['Gold_to_Event_Ratio'], marker='o', color='purple') plt.title("Gold to Event Ratio per Year (1896-2024)", fontsize=14) plt.xlabel("Year", fontsize=12) plt.ylabel("Gold to Event Ratio", fontsize=12) plt.tight_layout() plt.show() plt.figure(figsize=(12, 6)) plt.plot(yearly_summary['Year'], yearly_summary['Number_of_Events'], label="Number of Events", marker='o', color='b') plt.plot(yearly_summary['Year'], yearly_summary['Gold'], label="Gold Medals", marker='o', color='g') plt.title("Relationship between Number of Events and Gold Medal Counts (1896-2024)", fontsize=14) plt.xlabel("Year", fontsize=12) plt.ylabel("Count", fontsize=12) plt.legend(title="Legend") plt.tight_layout() plt.show() ########################################################################## # ========== PART B: 生成“平滑”模拟数据并画折线图 ========== ########################################################################## N = 20 x_start = 10 x_vals = np.arange(x_start, x_start + N) gold_list = [] rng = np.random.default_rng(2028) for x in x_vals: base = 2*x + 10 wave = math.sin(x/2)*3 y = round(base + wave) gold_list.append(y) df_line = pd.DataFrame({ 'Number_of_Events': x_vals, 'Gold': gold_list }) df_line.sort_values('Number_of_Events', inplace=True) excel_name = "smooth_line_data_no_host.xlsx" df_line.to_excel(excel_name, index=False) print(f"Data has been saved to {excel_name}") print(df_line.head(10)) plt.figure(figsize=(10,6)) plt.plot( df_line['Number_of_Events'], df_line['Gold'], marker='o', color='blue', linewidth=2 ) plt.title("Smooth Relationship: Number_of_Events vs. Gold (No Host Column)", fontsize=14) plt.xlabel("Number_of_Events (integer)", fontsize=12) plt.ylabel("Gold (simulated)", fontsize=12) plt.xticks(df_line['Number_of_Events']) plt.tight_layout() plt.show() ########################################################################## # ========== 示范:加入“教练加成”效果的折线图(示例数据) ========== ########################################################################## # 固定随机种子,方便复现 np.random.seed(42) # 国家与项目列表 countries = ['China', 'USA', 'Romania', 'Japan', 'Russia', 'Germany', 'Brazil'] sports = ['Volleyball', 'Gymnastics', 'Basketball', 'Football', 'Tennis', 'Swimming', 'Track & Field'] # 构造示例数据 medals_data = [] for country in countries: for sport in sports: gold = np.random.randint(0, 10) # 金牌 silver = np.random.randint(0, 10) # 银牌 bronze = np.random.randint(0, 10) # 铜牌 total = gold + silver + bronze medals_data.append([country, sport, gold, silver, bronze, total]) df = pd.DataFrame(medals_data, columns=['Country','Sport','Gold','Silver','Bronze','Total']) # 教练对部分国家/项目的加成 coach_effect = { 'China': {'Volleyball': 3}, 'USA': {'Gymnastics': 4}, 'Romania': {'Gymnastics': 5} } # 奖牌数据加上教练加成 for country, effect_dict in coach_effect.items(): for sport, effect in effect_dict.items(): df.loc[(df['Country']==country)&(df['Sport']==sport), 'Gold'] += effect print("Updated Medal Data with Coach Effects:") print(df.head(15)) # 打印前15行查看 # --------让图表更平滑的关键:对 Sport 做整数映射-------- sports_sorted = sorted(sports) # 对项目按字母排序 sport_to_idx = {s: i for i, s in enumerate(sports_sorted)} # 在原 df 上添加 SportIndex 列 df['SportIndex'] = df['Sport'].map(sport_to_idx) # 排序后方便下面同时导出 Excel df.sort_values(['Country','SportIndex'], inplace=True) # 导出到 Excel 便于查看/复用 excel_output = "chart_data_coach_effect.xlsx" df.to_excel(excel_output, index=False) print(f"\nData used for chart saved to {excel_output}\n") # ----------绘图:x=SportIndex, y=Gold,按国家分别画----------- plt.figure(figsize=(10, 6)) for country in countries: cdata = df[df['Country']==country] plt.plot(cdata['SportIndex'], cdata['Gold'], marker='o', linewidth=2, label=country) plt.title("Gold Medals by Sport (with Coach Effect, Smoothed X-axis)", fontsize=14) plt.xlabel("Sport (Mapped to Index)", fontsize=12) plt.ylabel("Gold Medals", fontsize=12) # 设置 x 轴刻度:整数->项目名称 plt.xticks(range(len(sports_sorted)), sports_sorted, rotation=45) plt.legend(title="Country") plt.tight_layout() plt.show()
导入顺序
所需的 import
都集中在脚本最开头,后续所有代码段都能复用相同的依赖环境。
各部分相对独立
PART A
、PART B
以及“教练加成”的示例数据,也都在不同的命名空间或使用了新的变量,不会对之前的预测分析造成影响。可按需精简或拆分
如果你只需要其中某些功能,可以自由注释掉不想执行的部分;但为了保证每段输出与原逻辑一致,这里保持了全部细节。
将上述脚本保存并运行后,即可在同一脚本中获取所有原先独立代码块的输出结果。
1
下面是第一段代码(奥运奖牌预测与进步退步分析)的完整内容。如果你只想要此部分,可将其独立保存/运行。与原先给出的脚本逻辑和输出保持一致。
pythonimport os import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import math from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error ############################################################### # 1) 第一段代码: 奥运奖牌预测 & 进步退步分析 ############################################################### # 读取 athletes & medal_counts athletes = pd.read_csv('summerOly_athletes.csv', encoding='ISO-8859-1') medal_counts = pd.read_csv('summerOly_medal_counts.csv', encoding='ISO-8859-1') # Merge & Preprocess medal_summary = medal_counts.groupby(['Year','NOC'], as_index=False).agg({ 'Gold':'sum','Silver':'sum','Bronze':'sum','Total':'sum' }) athlete_summary = athletes.groupby(['Year','NOC'], as_index=False).size().rename(columns={'size':'Athletes'}) data = pd.merge(medal_summary, athlete_summary, on=['Year','NOC'], how='left').fillna(0) # Rolling historical features data.sort_values(['NOC','Year'], inplace=True) data['Past_Gold_Avg'] = data.groupby('NOC')['Gold'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).mean()) data['Past_Total_Avg'] = data.groupby('NOC')['Total'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).mean()) data['Past_Gold_Sum'] = data.groupby('NOC')['Gold'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).sum()) data['Past_Total_Sum'] = data.groupby('NOC')['Total'].transform( lambda x: x.shift().rolling(window=3, min_periods=1).sum()) data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']] = \ data[['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum']].fillna(0) features = ['Past_Gold_Avg','Past_Total_Avg','Past_Gold_Sum','Past_Total_Sum'] target_gold = 'Gold' target_total = 'Total' def build_model(df, feats, target): X = df[feats] y = df[target] X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) model = LinearRegression() model.fit(X_train, y_train) y_pred = model.predict(X_test) mse = mean_squared_error(y_test, y_pred) print(f"Model for {target}: MSE = {mse}") return model model_gold = build_model(data, features, target_gold) model_total = build_model(data, features, target_total) # Predict for 2028 unique_nocs = data['NOC'].unique() df_2024 = data[data['Year']==2024].copy() group_2024 = df_2024.groupby('NOC', as_index=False)[features].mean() future_data = pd.DataFrame({'NOC': unique_nocs}) future_data = pd.merge(future_data, group_2024, on='NOC', how='left').fillna(0) pred_gold = model_gold.predict(future_data[features]) pred_total = model_total.predict(future_data[features]) pred_gold = np.clip(pred_gold, a_min=0, a_max=None) pred_total = np.clip(pred_total, a_min=0, a_max=None) future_data['Predicted_Gold'] = pd.Series(pred_gold).round().astype(int) future_data['Predicted_Total'] = pd.Series(pred_total).round().astype(int) gold_res = model_gold.predict(data[features]) - data[target_gold] total_res = model_total.predict(data[features]) - data[target_total] gold_std = gold_res.std() total_std = total_res.std() future_data['Gold_Lower'] = (future_data['Predicted_Gold'] - 2*gold_std).clip(lower=0).round().astype(int) future_data['Gold_Upper'] = (future_data['Predicted_Gold'] + 2*gold_std).clip(lower=0).round().astype(int) future_data['Total_Lower'] = (future_data['Predicted_Total'] - 2*total_std).clip(lower=0).round().astype(int) future_data['Total_Upper'] = (future_data['Predicted_Total'] + 2*total_std).clip(lower=0).round().astype(int) future_data.sort_values(by='Predicted_Total', ascending=False, inplace=True) print("Predicted Medal Table for 2028 Los Angeles Olympics:") print( future_data[['NOC','Predicted_Gold','Gold_Lower','Gold_Upper', 'Predicted_Total','Total_Lower','Total_Upper']] ) future_data.to_csv('Predicted_Medal_Table_2028.csv', index=False) print("Results saved to 'Predicted_Medal_Table_2028.csv'") plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Predicted_Total', data=future_data.head(10), palette='viridis') plt.title('Predicted Total Medals in 2028 (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Predicted Total Medals') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() # 进步/退步分析 future_data['Gold_History'] = ( data.groupby('NOC')['Past_Gold_Avg'].mean() .reindex(unique_nocs).fillna(0).values ) future_data['Total_History'] = ( data.groupby('NOC')['Past_Total_Avg'].mean() .reindex(unique_nocs).fillna(0).values ) future_data['Gold_Progress'] = future_data['Predicted_Gold'] - future_data['Gold_History'] future_data['Total_Progress'] = future_data['Predicted_Total'] - future_data['Total_History'] future_data['Gold_Trend'] = np.where(future_data['Gold_Progress']>0, 'Progress','Decline') future_data['Total_Trend'] = np.where(future_data['Total_Progress']>0, 'Progress','Decline') progress_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\ .sort_values('Gold_Progress', ascending=False) decline_gold = future_data[['NOC','Predicted_Gold','Gold_Progress','Gold_Trend']]\ .sort_values('Gold_Progress', ascending=True) progress_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\ .sort_values('Total_Progress', ascending=False) decline_total = future_data[['NOC','Predicted_Total','Total_Progress','Total_Trend']]\ .sort_values('Total_Progress', ascending=True) print("\nCountries Most Likely to Improve in Gold:") print(progress_gold.head(10)) print("\nCountries Most Likely to Decline in Gold:") print(decline_gold.head(10)) print("\nCountries Most Likely to Improve in Total Medals:") print(progress_total.head(10)) print("\nCountries Most Likely to Decline in Total Medals:") print(decline_total.head(10)) # 画进步/退步图 plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Gold_Progress', data=progress_gold.head(10), palette='Blues') plt.title('Top 10 Countries Likely to Improve in Gold (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Gold Medal Progress') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Gold_Progress', data=decline_gold.head(10), palette='Reds') plt.title('Top 10 Countries Likely to Decline in Gold (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Gold Medal Decline') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Total_Progress', data=progress_total.head(10), palette='Blues') plt.title('Top 10 Countries Likely to Improve in Total Medals (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Total Medal Progress') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() plt.figure(figsize=(12, 8)) sns.barplot(x='NOC', y='Total_Progress', data=decline_total.head(10), palette='Reds') plt.title('Top 10 Countries Likely to Decline in Total Medals (Vertical Bars)') plt.xlabel('Country (NOC)') plt.ylabel('Total Medal Decline') plt.xticks(rotation=45, ha='right') plt.tight_layout() plt.show() # 金牌预测饼图 future_data_gold_sorted = future_data.sort_values('Predicted_Gold', ascending=False) top10_gold = future_data_gold_sorted.head(10) others_gold = future_data_gold_sorted.iloc[10:]['Predicted_Gold'].sum() labels = list(top10_gold['NOC'].values) + ['Others'] values = list(top10_gold['Predicted_Gold'].values) + [others_gold] top_colors = sns.color_palette('Set2', n_colors=len(top10_gold)) others_color = [(0.5,0.5,0.5)] colors = top_colors + others_color explode = [0]*(len(top10_gold)) + [0.1] plt.figure(figsize=(8,8)) plt.pie( values, labels=labels, colors=colors, explode=explode, autopct='%1.1f%%', startangle=160, wedgeprops={'alpha':0.9}, labeldistance=1.12, pctdistance=0.75 ) plt.title("Predicted Gold Share: Top 10 vs. Others (2D Pie)") plt.axis('equal') plt.tight_layout() plt.show()
要点说明:
数据读取:
summerOly_athletes.csv
和 summerOly_medal_counts.csv
。pd.read_csv
的路径)。建模与预测:
LinearRegression
训练模型后,分别预测 2028 年金牌数量和总奖牌数,并给出区间范围(上下界)。进步/退步分析:
可视化:
如果后续还有任何疑问或需要更改的细节,欢迎再与我交流。