작성일자 : 2023-12-24
Ver 0.1.1
In [1]:
# 필요 패키지 불러오기
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
In [2]:
# 현대 디렉토리 확인
current_dir = os.getcwd()
current_dir
Out[2]:
'/Users/limjongjun/Desktop/JayJay/Growth/Python/soccer-analytics/Excercise'
In [3]:
# Data가 있는 디렉토리로 변경
new_dir = '/Users/limjongjun/Desktop/JayJay/Growth/Python/soccer-analytics'
os.chdir(new_dir)
print(current_dir)
/Users/limjongjun/Desktop/JayJay/Growth/Python/soccer-analytics/Excercise
전경기 슈팅 데이터 불러오기¶
(1) 이벤트 데이터 불러오기
- 머신러닝은 데이터가 많이 확보 될 수록 성능에 유리하므로 모든 데이터 불러오기
In [4]:
competitions = [x for x in os.listdir('data/refined_events') if not x.startswith('.')]
match_events_list = []
for competition_name in competitions:
match_df = pd.read_csv(f'data/refined_events/{competition_name}/matches.csv', index_col=0, encoding='utf-8-sig')
for match_id in tqdm(match_df.index, desc=f"{competition_name + ' ':10s}"):
match_events = pd.read_pickle(f'data/refined_events/{competition_name}/{match_id}.pkl')
match_events['competition_name'] = competition_name # 어떤 대회인지 명시
match_events_list.append(match_events)
events = pd.concat(match_events_list, ignore_index=True)
events
England : 100%|██████████| 380/380 [00:00<00:00, 544.61it/s] France : 100%|██████████| 380/380 [00:00<00:00, 546.76it/s] Spain : 100%|██████████| 380/380 [00:00<00:00, 640.59it/s] Germany : 100%|██████████| 306/306 [00:00<00:00, 529.96it/s] World_Cup : 100%|██████████| 64/64 [00:00<00:00, 293.18it/s] Italy : 100%|██████████| 380/380 [00:00<00:00, 571.97it/s] European_Championship : 100%|██████████| 51/51 [00:00<00:00, 764.90it/s]
Out[4]:
match_id | event_id | period | time | team_id | team_name | player_id | player_name | event_type | sub_event_type | tags | start_x | start_y | end_x | end_y | competition_name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2499719 | 177959171 | 1H | 2.759 | 1609 | Arsenal | 25413 | A. Lacazette | Pass | Simple pass | [Accurate] | 50.96 | 34.68 | 32.24 | 14.96 | England |
1 | 2499719 | 177959172 | 1H | 4.947 | 1609 | Arsenal | 370224 | R. Holding | Pass | High pass | [Accurate] | 32.24 | 14.96 | 53.04 | 17.00 | England |
2 | 2499719 | 177959173 | 1H | 6.542 | 1609 | Arsenal | 3319 | M. Özil | Pass | Head pass | [Accurate] | 53.04 | 17.00 | 36.40 | 19.72 | England |
3 | 2499719 | 177959174 | 1H | 8.143 | 1609 | Arsenal | 120339 | Mohamed Elneny | Pass | Head pass | [Accurate] | 36.40 | 19.72 | 42.64 | 3.40 | England |
4 | 2499719 | 177959175 | 1H | 10.302 | 1609 | Arsenal | 167145 | Bellerín | Pass | Simple pass | [Accurate] | 42.64 | 3.40 | 74.88 | 8.16 | England |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3273243 | 1694440 | 90589035 | E2 | 984.817 | 4418 | France | 7936 | P. Pogba | Offside | [] | 92.56 | 48.96 | NaN | NaN | European_Championship | |
3273244 | 1694440 | 90589357 | E2 | 1026.418 | 9905 | Portugal | 70134 | Rui Patrício | Foul | Time lost foul | [Yellow card] | 10.40 | 26.52 | NaN | NaN | European_Championship |
3273245 | 1694440 | 90589358 | E2 | 1031.554 | 9905 | Portugal | 70134 | Rui Patrício | Free kick | Free kick | [Accurate] | 9.36 | 25.16 | 69.68 | 14.96 | European_Championship |
3273246 | 1694440 | 90589359 | E2 | 1035.343 | 9905 | Portugal | 70410 | Éder | Duel | Air duel | [Lost, Not accurate] | 69.68 | 14.96 | 69.68 | 14.96 | European_Championship |
3273247 | 1694440 | 90589040 | E2 | 1036.319 | 4418 | France | 7936 | P. Pogba | Duel | Air duel | [Won, Accurate] | 34.32 | 53.04 | NaN | NaN | European_Championship |
3273248 rows × 16 columns
(2) 슈팅 데이터 필터링
In [5]:
shots = events[
(events['event_type'] == 'Shot') | (events['sub_event_type'].isin(['Free kick shot', 'Penalty']))
].reset_index(drop=True)
shots
Out[5]:
match_id | event_id | period | time | team_id | team_name | player_id | player_name | event_type | sub_event_type | tags | start_x | start_y | end_x | end_y | competition_name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2499719 | 177959212 | 1H | 94.596 | 1609 | Arsenal | 25413 | A. Lacazette | Shot | Shot | [Goal, Right foot, Opportunity, Position: Goal... | 91.52 | 40.12 | 104.00 | 34.00 | England |
1 | 2499719 | 177959247 | 1H | 179.855 | 1631 | Leicester City | 26150 | R. Mahrez | Shot | Shot | [Left foot, Opportunity, Position: Out center ... | 88.40 | 32.64 | 104.00 | 34.00 | England |
2 | 2499719 | 177959280 | 1H | 254.745 | 1631 | Leicester City | 14763 | S. Okazaki | Shot | Shot | [Goal, Head/body, Opportunity, Position: Goal ... | 99.84 | 32.64 | 104.00 | 34.00 | England |
3 | 2499719 | 177959289 | 1H | 425.824 | 1609 | Arsenal | 7868 | A. Oxlade-Chamberlain | Shot | Shot | [Left foot, Opportunity, Position: Out high le... | 84.24 | 45.56 | 104.00 | 34.00 | England |
4 | 2499719 | 177959429 | 1H | 815.462 | 1609 | Arsenal | 7868 | A. Oxlade-Chamberlain | Shot | Shot | [Right foot, Opportunity, Position: Goal low l... | 78.00 | 47.60 | 104.00 | 34.00 | England |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
45940 | 1694440 | 90588583 | 2H | 2776.504 | 4418 | France | 25575 | A. Gignac | Shot | Shot | [Right foot, Opportunity, Position: Post cente... | 96.72 | 39.44 | 104.00 | 34.00 | European_Championship |
45941 | 1694440 | 90589205 | E1 | 807.318 | 9905 | Portugal | 70410 | Éder | Shot | Shot | [Head/body, Opportunity, Position: Goal center... | 93.60 | 42.16 | 104.00 | 34.00 | European_Championship |
45942 | 1694440 | 90589242 | E2 | 144.487 | 9905 | Portugal | 28907 | Raphaël Guerreiro | Free kick | Free kick shot | [Left foot, Direct, Position: Post high left, ... | 79.04 | 46.92 | 104.00 | 34.00 | European_Championship |
45943 | 1694440 | 90589254 | E2 | 204.428 | 9905 | Portugal | 70410 | Éder | Shot | Shot | [Goal, Right foot, Opportunity, Position: Goal... | 80.08 | 36.72 | 104.00 | 34.00 | European_Championship |
45944 | 1694440 | 90589034 | E2 | 981.239 | 4418 | France | 134513 | A. Martial | Shot | Shot | [Right foot, Blocked, Not accurate] | 90.48 | 40.80 | 94.64 | 39.44 | European_Championship |
45945 rows × 16 columns
슈팅별 특징 추출¶
(1) 슈팅 위치 및 거리 계산
In [6]:
shot_features = pd.DataFrame(index=shots.index)
shot_features['x'] = 104 - shots['start_x']
# y축 기준으로 골대는 정중앙에 위치하고 있기 때문에 y의 절반 값인 34을 빼준다.
shot_features['y'] = shots['start_y'] - 34
# 피타고라스 정리에 의해 Distance를 구해준다. (np.linalg.norm 함수 적용)
shot_features['distance'] = shot_features[['x', 'y']].apply(np.linalg.norm, axis=1)
shot_features
Out[6]:
x | y | distance | |
---|---|---|---|
0 | 12.48 | 6.12 | 13.899813 |
1 | 15.60 | -1.36 | 15.659170 |
2 | 4.16 | -1.36 | 4.376665 |
3 | 19.76 | 11.56 | 22.893038 |
4 | 26.00 | 13.60 | 29.342120 |
... | ... | ... | ... |
45940 | 7.28 | 5.44 | 9.088014 |
45941 | 10.40 | 8.16 | 13.219138 |
45942 | 24.96 | 12.92 | 28.105658 |
45943 | 23.92 | 2.72 | 24.074152 |
45944 | 13.52 | 6.80 | 15.133750 |
45945 rows × 3 columns
(2) 슈팅 각도 계산
- 슈팅 각도도 득점 확률에 기여하는 요소이므로 계산필요
In [7]:
x = shot_features['x']
y = shot_features['y']
goal_width = 7.32
# 각도 theta를 구하는 공식
angles = np.arctan((goal_width * x) / (x ** 2 + y ** 2 - (goal_width / 2) ** 2)) * 180 / np.pi
shot_features['angle'] = np.where(angles >= 0, angles, angles + 180)
shot_features
Out[7]:
x | y | distance | angle | |
---|---|---|---|---|
0 | 12.48 | 6.12 | 13.899813 | 26.933236 |
1 | 15.60 | -1.36 | 15.659170 | 26.224941 |
2 | 4.16 | -1.36 | 4.376665 | 79.289489 |
3 | 19.76 | 11.56 | 22.893038 | 15.813597 |
4 | 26.00 | 13.60 | 29.342120 | 12.655803 |
... | ... | ... | ... | ... |
45940 | 7.28 | 5.44 | 9.088014 | 37.600623 |
45941 | 10.40 | 8.16 | 13.219138 | 25.258830 |
45942 | 24.96 | 12.92 | 28.105658 | 13.240079 |
45943 | 23.92 | 2.72 | 24.074152 | 17.184831 |
45944 | 13.52 | 6.80 | 15.133750 | 24.652934 |
45945 rows × 4 columns
(3) 슈팅 유형 및 득점 여부 추출
In [8]:
shot_features['freekick'] = (shots['event_type'] == 'Free kick').astype(int)
shot_features['header'] = shots['tags'].apply(lambda x: 'Head/body' in x).astype(int)
shot_features['goal'] = shots['tags'].apply(lambda x: 'Goal' in x).astype(int)
shot_features
Out[8]:
x | y | distance | angle | freekick | header | goal | |
---|---|---|---|---|---|---|---|
0 | 12.48 | 6.12 | 13.899813 | 26.933236 | 0 | 0 | 1 |
1 | 15.60 | -1.36 | 15.659170 | 26.224941 | 0 | 0 | 0 |
2 | 4.16 | -1.36 | 4.376665 | 79.289489 | 0 | 1 | 1 |
3 | 19.76 | 11.56 | 22.893038 | 15.813597 | 0 | 0 | 0 |
4 | 26.00 | 13.60 | 29.342120 | 12.655803 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... |
45940 | 7.28 | 5.44 | 9.088014 | 37.600623 | 0 | 0 | 0 |
45941 | 10.40 | 8.16 | 13.219138 | 25.258830 | 0 | 1 | 0 |
45942 | 24.96 | 12.92 | 28.105658 | 13.240079 | 1 | 0 | 0 |
45943 | 23.92 | 2.72 | 24.074152 | 17.184831 | 0 | 0 | 1 |
45944 | 13.52 | 6.80 | 15.133750 | 24.652934 | 0 | 0 | 0 |
45945 rows × 7 columns
(4) 슈팅 데이터 연결 및 저장
In [9]:
# 계산한 슈팅 데이터를 앞에서 생성한 DataFrame과 병합 (오른쪽에 붙이는 형식)
shots = pd.concat([shots[['competition_name'] + shots.columns[:-5].tolist()], shot_features], axis=1)
shots
Out[9]:
competition_name | match_id | event_id | period | time | team_id | team_name | player_id | player_name | event_type | sub_event_type | tags | x | y | distance | angle | freekick | header | goal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | England | 2499719 | 177959212 | 1H | 94.596 | 1609 | Arsenal | 25413 | A. Lacazette | Shot | Shot | [Goal, Right foot, Opportunity, Position: Goal... | 12.48 | 6.12 | 13.899813 | 26.933236 | 0 | 0 | 1 |
1 | England | 2499719 | 177959247 | 1H | 179.855 | 1631 | Leicester City | 26150 | R. Mahrez | Shot | Shot | [Left foot, Opportunity, Position: Out center ... | 15.60 | -1.36 | 15.659170 | 26.224941 | 0 | 0 | 0 |
2 | England | 2499719 | 177959280 | 1H | 254.745 | 1631 | Leicester City | 14763 | S. Okazaki | Shot | Shot | [Goal, Head/body, Opportunity, Position: Goal ... | 4.16 | -1.36 | 4.376665 | 79.289489 | 0 | 1 | 1 |
3 | England | 2499719 | 177959289 | 1H | 425.824 | 1609 | Arsenal | 7868 | A. Oxlade-Chamberlain | Shot | Shot | [Left foot, Opportunity, Position: Out high le... | 19.76 | 11.56 | 22.893038 | 15.813597 | 0 | 0 | 0 |
4 | England | 2499719 | 177959429 | 1H | 815.462 | 1609 | Arsenal | 7868 | A. Oxlade-Chamberlain | Shot | Shot | [Right foot, Opportunity, Position: Goal low l... | 26.00 | 13.60 | 29.342120 | 12.655803 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
45940 | European_Championship | 1694440 | 90588583 | 2H | 2776.504 | 4418 | France | 25575 | A. Gignac | Shot | Shot | [Right foot, Opportunity, Position: Post cente... | 7.28 | 5.44 | 9.088014 | 37.600623 | 0 | 0 | 0 |
45941 | European_Championship | 1694440 | 90589205 | E1 | 807.318 | 9905 | Portugal | 70410 | Éder | Shot | Shot | [Head/body, Opportunity, Position: Goal center... | 10.40 | 8.16 | 13.219138 | 25.258830 | 0 | 1 | 0 |
45942 | European_Championship | 1694440 | 90589242 | E2 | 144.487 | 9905 | Portugal | 28907 | Raphaël Guerreiro | Free kick | Free kick shot | [Left foot, Direct, Position: Post high left, ... | 24.96 | 12.92 | 28.105658 | 13.240079 | 1 | 0 | 0 |
45943 | European_Championship | 1694440 | 90589254 | E2 | 204.428 | 9905 | Portugal | 70410 | Éder | Shot | Shot | [Goal, Right foot, Opportunity, Position: Goal... | 23.92 | 2.72 | 24.074152 | 17.184831 | 0 | 0 | 1 |
45944 | European_Championship | 1694440 | 90589034 | E2 | 981.239 | 4418 | France | 134513 | A. Martial | Shot | Shot | [Right foot, Blocked, Not accurate] | 13.52 | 6.80 | 15.133750 | 24.652934 | 0 | 0 | 0 |
45945 rows × 19 columns
In [10]:
shots.to_pickle('data/shots.pkl')