작성일자 : 2024-05-09
Ver 0.1.1
시간 구간별 선수 활동량 지표 집계 및 시각화¶
In [1]:
import os
# directory 변경
new_dir = '/Users/limjongjun/Desktop/JayJay/Growth/Python/soccer-analytics'
os.chdir(new_dir)
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from src.plot_utils import draw_pitch
(1) 가공 데이터 불러오기¶
In [3]:
match_id = 1
# file = f'data_metrica/Sample_Game_{match_id}/Sample_Game_{match_id}_IntegratedData_Reshaped.csv'
file = f'data_metrica/data/Sample_Game_{match_id}/Sample_Game_{match_id}_IntegratedData_Reshaped.csv'
traces = pd.read_csv(file, header=0, index_col=0)
traces
Out[3]:
team | player_id | frame | period | time | x | y | vx | vy | speed | distance | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Home | 11 | 1 | 1 | 0.04 | 0.08528 | 32.80184 | NaN | NaN | NaN | NaN |
1 | Home | 11 | 2 | 1 | 0.08 | 0.09984 | 32.80184 | NaN | NaN | NaN | NaN |
2 | Home | 11 | 3 | 1 | 0.12 | 0.11856 | 32.80184 | NaN | NaN | NaN | NaN |
3 | Home | 11 | 4 | 1 | 0.16 | 0.12584 | 32.80184 | NaN | NaN | NaN | NaN |
4 | Home | 11 | 5 | 1 | 0.20 | 0.13416 | 32.80184 | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4060163 | Away | 28 | 145002 | 2 | 5800.08 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060164 | Away | 28 | 145003 | 2 | 5800.12 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060165 | Away | 28 | 145004 | 2 | 5800.16 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060166 | Away | 28 | 145005 | 2 | 5800.20 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060167 | Away | 28 | 145006 | 2 | 5800.24 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060168 rows × 11 columns
(2) 후반전 데이터 시점 조정¶
In [4]:
h1_duration = traces[traces['period'] == 1]['time'].max()
h1_duration
Out[4]:
2850.72
In [5]:
traces[traces['period'] == 2]
Out[5]:
team | player_id | frame | period | time | x | y | vx | vy | speed | distance | |
---|---|---|---|---|---|---|---|---|---|---|---|
71268 | Home | 11 | 71269 | 2 | 2850.76 | -1.33016 | 35.83396 | NaN | NaN | NaN | NaN |
71269 | Home | 11 | 71270 | 2 | 2850.80 | -1.33016 | 35.83396 | NaN | NaN | NaN | NaN |
71270 | Home | 11 | 71271 | 2 | 2850.84 | -1.33016 | 35.83396 | NaN | NaN | NaN | NaN |
71271 | Home | 11 | 71272 | 2 | 2850.88 | -1.32808 | 35.83396 | NaN | NaN | NaN | NaN |
71272 | Home | 11 | 71273 | 2 | 2850.92 | -1.31248 | 35.83396 | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
4060163 | Away | 28 | 145002 | 2 | 5800.08 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060164 | Away | 28 | 145003 | 2 | 5800.12 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060165 | Away | 28 | 145004 | 2 | 5800.16 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060166 | Away | 28 | 145005 | 2 | 5800.20 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
4060167 | Away | 28 | 145006 | 2 | 5800.24 | 81.33008 | 12.86628 | 0.0 | 0.0 | 0.0 | 0.0 |
2064664 rows × 11 columns
In [6]:
# 후반전의 시작도 0초가 되도록 조정
traces.loc[traces['period'] == 2, 'time'] = (traces.loc[traces['period'] == 2, 'time'] - h1_duration).round(2)
(3) 시간 구간별 지표 산출¶
In [7]:
player_id = 8
player_trace = traces[traces['player_id'] == player_id]
segment_size = 15
time_bins = np.arange(0, 50, segment_size)
time_bins = np.append(time_bins, [100])
speed_bins = [0, 7, 15, 20, 25, 50]
stats_by_time_list = []
for period in player_trace['period'].unique():
period_trace = player_trace[player_trace['period'] == period]
time_labels = []
for i in range(len(time_bins)-2):
if period == 1:
segment_label = f'{time_bins[i]:02d}-{time_bins[i+1]:02d}'
else:
segment_label = f'{time_bins[i]+45:02d}-{time_bins[i+1]+45:02d}'
time_labels.append(segment_label)
extra_time_label = '45+' if period == 1 else '90+'
time_labels.append(extra_time_label)
period_trace['time_cat'] = pd.cut(period_trace['time'] / 60, bins=time_bins, right=True, labels=time_labels)
durations = period_trace[['time_cat', 'x']].dropna().groupby('time_cat').count() * 0.04
distances = period_trace.groupby('time_cat')['distance'].sum() #시간 카테고리로 그룹
durations.columns = ['duration']
period_stats = pd.concat([durations, distances], axis=1)
period_stats['dist_1min'] = period_stats['distance'] / period_stats['duration'] * 60
speed_cats = pd.cut(period_trace['speed'], bins=speed_bins, right=False, labels=np.arange(1, 6))
distances_by_speed = period_trace.pivot_table('distance', index='time_cat', columns=speed_cats, aggfunc='sum')
distances_by_speed.columns = [f'zone{i}_dist' for i in distances_by_speed.columns]
period_stats = pd.concat([period_stats, distances_by_speed], axis=1)
period_stats['max_speed'] = period_trace.groupby('time_cat')['speed'].max()
stats_by_time_list.append(period_stats)
stats_by_time = pd.concat(stats_by_time_list).round(2)
stats_by_time
/var/folders/_b/znjp14gd02d8lg63thqc7bm40000gn/T/ipykernel_8120/3127276049.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy period_trace['time_cat'] = pd.cut(period_trace['time'] / 60, bins=time_bins, right=True, labels=time_labels) /var/folders/_b/znjp14gd02d8lg63thqc7bm40000gn/T/ipykernel_8120/3127276049.py:23: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy period_trace['time_cat'] = pd.cut(period_trace['time'] / 60, bins=time_bins, right=True, labels=time_labels)
Out[7]:
duration | distance | dist_1min | zone1_dist | zone2_dist | zone3_dist | zone4_dist | zone5_dist | max_speed | |
---|---|---|---|---|---|---|---|---|---|
time_cat | |||||||||
00-15 | 900.00 | 1825.00 | 121.67 | 654.89 | 711.18 | 205.99 | 94.19 | 158.75 | 31.19 |
15-30 | 900.00 | 1638.40 | 109.23 | 652.11 | 668.54 | 200.87 | 70.14 | 46.74 | 29.89 |
30-45 | 900.00 | 1736.20 | 115.75 | 595.10 | 668.36 | 289.10 | 144.50 | 39.14 | 28.76 |
45+ | 150.72 | 347.39 | 138.29 | 100.89 | 146.02 | 40.58 | 55.46 | 4.44 | 29.83 |
45-60 | 900.00 | 1630.23 | 108.68 | 583.77 | 742.21 | 214.99 | 77.27 | 11.99 | 28.69 |
60-75 | 900.00 | 1728.86 | 115.26 | 543.01 | 672.68 | 220.92 | 168.70 | 123.54 | 31.98 |
75-90 | 900.00 | 1801.44 | 120.10 | 616.24 | 665.98 | 307.52 | 154.00 | 57.70 | 27.94 |
90+ | 249.52 | 412.10 | 99.09 | 152.14 | 189.60 | 42.59 | 19.28 | 8.49 | 25.67 |
(4) 시간 구간별 속도 구간별 뛴 거리 시각화¶
In [8]:
plt.figure(figsize=(12, 6))
plt.rcParams.update({'font.size': 15})
n_zones = len(distances_by_speed.columns)
colors = plt.cm.jet(np.linspace(0.9, 0.1, n_zones))
bottom = 0
for i, zone_dist in enumerate(distances_by_speed.columns[::-1]):
plt.bar(
stats_by_time.index, stats_by_time[zone_dist], #시간구간별 인덱스 사용
bottom=bottom, width=0.4, color=colors[i], label=f'Zone {5-i}'
)
if i < n_zones - 1:
bottom = bottom + stats_by_time[zone_dist]
plt.grid(axis='y', color='k', linestyle='--')
plt.axvline(3.5, color='k', linestyle='--')
plt.xlabel('time_cat')
plt.ylabel('distance')
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1], bbox_to_anchor=(1, 1))
plt.title('Distance by Speed Zone')
plt.show()
히트맵 시각화¶
(1) 위치 히트맵 시각화¶
In [9]:
x = player_trace['x']
y = player_trace['y']
dense_heatmap = np.histogram2d(y, x, bins=(34, 52), range=[[0, 68], [0, 104]])[0]
draw_pitch('white', 'black')
img = plt.imshow(dense_heatmap[::-1], extent=[0, 104, 0, 68], vmin=0, vmax=500, cmap='RdYlGn_r', alpha=0.8)
plt.colorbar()
plt.title('Location Heatmap')
plt.show()
(2) 18-Zone Map 시각화¶
- 경기장을 18 구역으로 나눈 시각화 (경향성 및 체류시간 파악)
In [10]:
grid_size = (3, 6)
sparse_heatmap, yedges, xedges = np.histogram2d(y, x, bins=grid_size, range=[[0, 68], [0, 104]])
sparse_heatmap /= len(x)
draw_pitch('white', 'black')
img = plt.imshow(sparse_heatmap[::-1], extent=[0, 104, 0, 68], vmin=0, vmax=0.2, cmap='RdYlGn_r', alpha=0.8)
plt.colorbar()
for i in range(grid_size[0]):
for j in range(grid_size[1]):
text_x = (xedges[j] + xedges[j+1]) / 2
text_y = (yedges[i] + yedges[i+1]) / 2
plt.text(text_x, text_y, f'{sparse_heatmap[i, j]:.3f}', ha='center', va='center')
plt.title('18-Zone Map')
plt.show()
(3) 방향 히트맵 시각화¶
In [11]:
player_running_trace = player_trace[player_trace['speed'] > 15] # 낮은 속도로 뛴 데이터는 제외
vx = player_running_trace['vx'] #속도 벡터 x 성분
vy = player_running_trace['vy'] #속도 벡터 y 성분
vlim = 8
dense_heatmap = np.histogram2d(vy, vx, bins=(vlim*5, vlim*5), range=[[-vlim, vlim], [-vlim, vlim]])[0]
plt.figure(figsize=(10, 8))
img = plt.imshow(dense_heatmap[::-1], extent=[-vlim, vlim, -vlim, vlim], vmin=0, vmax=100, cmap='jet')
plt.colorbar()
plt.axvline(0, color='w', linestyle='--')
plt.axhline(0, color='w', linestyle='--')
plt.title('Direction Heatmap')
plt.show()