There can be no doubt about it -- soccer is growing rapidly in the United States. MLS player salaries go up every year, as does average attendance at MLS games, and the league is still actively expanding. The number of high school soccer players has grown by 30% since 2004. And we have a World Cup-winning women's team!
MLS salaries are pretty high by ordinary standards, with a median of $179,000, but relatively low by professional athlete standards. Keep in mind that the median NFL salary is $860,000.
In this tutorial, our goal is to analyze the distribution of soccer players' salaries, find what factors determine a soccer player's salary, and see if we can predict a soccer player's salary from their stats. We should be able to use the exploratory data analysis techniques and machine learning algorithms we learned in class to do so.
It is my hope that this tutorial will provide some insights into what skills a soccer player should have to be considered "valuable" by professional teams. It is well-documented that many MLS players are unhappy with their salary -- over a third make less than $100,000 a year. While I can't provide any macroeconomic solutions to these problems, hopefully I can give people some insight into how they should train if they want a pay raise.
I'll use a standard set of libraries here: NumPy to store arrays, pandas to store data in tabular form, seaborn for simple and visually appealing data visualization, scikit-learn for machine learning, etc.
I am also using the powerlaw library. powerlaw is a Python package that makes it simple to analyze heavy-tailed distributions; the relevance of this will become clear later on. You can find out more about powerlaw here.
# Necessary tools to complete this tutorial
import numpy as np
import pandas as pd
import re
import matplotlib
import powerlaw
import matplotlib.pyplot as plt
from ast import literal_eval
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(15,15)})
from tqdm import tqdm
import statsmodels.api as sm
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
I created a short helper function that gives some important information about a pandas dataframe -- the names of its columns and the number of rows it contains -- and then shows that dataframe's first few rows. This is completely optional, but it can be helpful when you're working with dataframes that have a fair number of columns.
# Utility function to describe dataframes
def describe(df):
    print("This dataframe has columns: " + str(df.columns))
    print("This data frame has " + str(len(df)) + " rows")
    return df.head()
Kaggle is an online community of data scientists and machine learning practitioners. It hosts over 50,000 public datasets, and two in particular look like they will be very useful to us. This dataset gives the salaries of Major League Soccer players from 2007 to 2017. This dataset contains data about every outfield player who has played in the MLS from 1996 to 2020.
Neither dataset contains enough data on its own to draw any conclusions about the relationship between a player's performance and his salary. But by merging the two together we will have a dataset that will work well for our purposes.
# Utility function that makes it simple to read in salary data in a given year to pandas
def create_df_from_salary_year(path):
    df = pd.read_csv(path)
    # Determine the year that we're working with and put it in the dataframe
    year = re.search('[0-9]+', path)[0]
    df["year"] = int(year)
    return df
We are given salary data from 2007-2017, with a separate .csv file for each year. The files are identical in structure (same column names), so here we combine them all into a single dataframe for ease of analysis.
# Import dataframes
# Read in data about players' performance
player_stats = pd.read_csv("player_stats.csv")
# Read in salary data
# Read in each year's salary file; the filenames differ only in the year
years = [create_df_from_salary_year(f"mls-salaries-{year}.csv") for year in range(2007, 2018)]
# Concatenate all of the data from different years together into one unified dataframe
salary_df = pd.concat(years)
describe(salary_df)
This dataframe has columns: Index(['club', 'last_name', 'first_name', 'position', 'base_salary', 'guaranteed_compensation', 'year'], dtype='object') This data frame has 5553 rows
| | club | last_name | first_name | position | base_salary | guaranteed_compensation | year |
---|---|---|---|---|---|---|---|
0 | CHI | Armas | Chris | M | 225000.0 | 225000.0 | 2007 |
1 | CHI | Banner | Michael | M | 12900.0 | 12900.0 | 2007 |
2 | CHI | Barrett | Chad | F | 41212.5 | 48712.5 | 2007 |
3 | CHI | Blanco | Cuauhtemoc | F | 2492316.0 | 2666778.0 | 2007 |
4 | CHI | Brown | C.J. | D | 106391.0 | 106391.0 | 2007 |
As one can see, we are given quite a bit of useful data from the salary dataset -- a player's name, his salary, the year in which he played -- but certainly not enough to discover any meaningful insights about the causal factors behind a player's salary.
Let's see what the "player stats" dataset has to offer us:
describe(player_stats)
This dataframe has columns: Index(['Player', 'Club', 'POS', 'GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 'PKG/A', 'HmG', 'RdG', 'G/90min', 'SC%', 'GWA', 'HmA', 'RdA', 'A/90min', 'FC', 'FS', 'OFF', 'YC', 'RC', 'SOG%', 'Year', 'Season'], dtype='object') This data frame has 15076 rows
| | Player | Club | POS | GP | GS | MINS | G | A | SHTS | SOG | ... | RdA | A/90min | FC | FS | OFF | YC | RC | SOG% | Year | Season |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Roy Lassiter | TB | F | 30 | 30 | 2580 | 27 | 4 | 76 | 49 | ... | 2 | 0.14 | 20 | 39 | 70 | 2 | 0 | 64.47 | 1996 | reg |
1 | Raul Diaz Arce | DC | F | 28 | 28 | 2351 | 23 | 2 | 100 | 49 | ... | 2 | 0.08 | 32 | 26 | 35 | 6 | 1 | 49.00 | 1996 | reg |
2 | Eduardo Hurtado | LA | F | 26 | 26 | 2323 | 21 | 7 | 87 | 56 | ... | 3 | 0.27 | 48 | 26 | 25 | 5 | 0 | 64.37 | 1996 | reg |
3 | Preki | KC | M | 32 | 32 | 2880 | 18 | 13 | 140 | 61 | ... | 4 | 0.41 | 26 | 44 | 7 | 3 | 0 | 43.57 | 1996 | reg |
4 | Brian McBride | CLB | F | 28 | 28 | 2307 | 17 | 3 | 79 | 44 | ... | 1 | 0.12 | 21 | 46 | 10 | 0 | 0 | 55.70 | 1996 | reg |
5 rows × 28 columns
This dataset clearly gives us a lot more features to work with. From this dataset we have information on a player's:

  * position (POS) and club (Club)
  * games played (GP), games started (GS), and minutes played (MINS)
  * goals (G) and assists (A), including home/road splits (HmG, RdG, HmA, RdA) and per-90-minute rates (G/90min, A/90min)
  * game-winning goals (GWG), game-winning assists (GWA), and penalty kick goals/attempts (PKG/A)
  * shots (SHTS), shots on goal (SOG), and shooting accuracy (SC%, SOG%)
  * fouls committed (FC) and fouls sustained (FS)
  * offsides (OFF), yellow cards (YC), and red cards (RC)
There are a lot of features to work with! In fact, there are probably too many. It can be hard to fit ML models to high-dimensional data, and it's harder to make sense of the dataframe on a qualitative level when there are a lot of columns. So it's worth thinking about which columns we can drop now.
I'd guess that some of the columns in the dataframe don't have a significant impact on salary. In particular, I think the number of fouls a player sustains and the number of yellow cards he picks up in a season aren't connected to his salary, simply because these things don't often change the way a game goes. It's annoying for a player to get a lot of yellow cards, but all in all it's not that important. Offsides penalties and red cards can cause a lot of problems for a team, so on an intuitive level it makes sense to keep those as features in whatever ML model we end up using.
"What features have a meaningful output on the output variable?" is a question that can be answered with empirical analysis/ We'll come back to this question.
The bigger problem at the moment is that some columns in this dataframe are redundant. We probably don't need to know how many goals or assists a player scored at home versus on the road, since we already know his totals. Similarly, it doesn't make much sense to include both "goals scored" and "goals scored per 90 minutes" as columns here.
We'll go ahead and drop the following columns: SC%, HmG, RdG, HmA, RdA, G/90min, A/90min, and PKG/A.
We should also rename player_stats' "Year" column to "year" to match the salary dataframe's "year" column. This will make it easier to merge the two dataframes together.
# Drop mentioned columns
player_stats.drop(['SC%', 'HmG', 'RdG', 'HmA','RdA','G/90min','A/90min', "PKG/A"], axis=1, inplace=True)
# Make year lowercase to easily merge datasets together
player_stats.rename(mapper={"Year" : "year"}, axis=1, inplace=True)
describe(player_stats)
This dataframe has columns: Index(['Player', 'Club', 'POS', 'GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 'GWA', 'FC', 'FS', 'OFF', 'YC', 'RC', 'SOG%', 'year', 'Season'], dtype='object') This data frame has 15076 rows
| | Player | Club | POS | GP | GS | MINS | G | A | SHTS | SOG | GWG | GWA | FC | FS | OFF | YC | RC | SOG% | year | Season |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Roy Lassiter | TB | F | 30 | 30 | 2580 | 27 | 4 | 76 | 49 | 4 | 2 | 20 | 39 | 70 | 2 | 0 | 64.47 | 1996 | reg |
1 | Raul Diaz Arce | DC | F | 28 | 28 | 2351 | 23 | 2 | 100 | 49 | 4 | 0 | 32 | 26 | 35 | 6 | 1 | 49.00 | 1996 | reg |
2 | Eduardo Hurtado | LA | F | 26 | 26 | 2323 | 21 | 7 | 87 | 56 | 6 | 0 | 48 | 26 | 25 | 5 | 0 | 64.37 | 1996 | reg |
3 | Preki | KC | M | 32 | 32 | 2880 | 18 | 13 | 140 | 61 | 3 | 2 | 26 | 44 | 7 | 3 | 0 | 43.57 | 1996 | reg |
4 | Brian McBride | CLB | F | 28 | 28 | 2307 | 17 | 3 | 79 | 44 | 3 | 0 | 21 | 46 | 10 | 0 | 0 | 55.70 | 1996 | reg |
We should merge the first/last name columns into a single "Player" column, since that's how player_stats is set up and we want to merge the two dataframes together.
A player's guaranteed compensation is a more accurate measure of how much money he makes than his base pay, so we'll go ahead and drop the base pay column.
salary_df["Player"] = salary_df["first_name"] + " " + salary_df["last_name"]
salary_df.drop(["last_name", "first_name", "base_salary"], axis=1, inplace=True)
describe(salary_df)
This dataframe has columns: Index(['club', 'position', 'guaranteed_compensation', 'year', 'Player'], dtype='object') This data frame has 5553 rows
| | club | position | guaranteed_compensation | year | Player |
---|---|---|---|---|---|
0 | CHI | M | 225000.0 | 2007 | Chris Armas |
1 | CHI | M | 12900.0 | 2007 | Michael Banner |
2 | CHI | F | 48712.5 | 2007 | Chad Barrett |
3 | CHI | F | 2666778.0 | 2007 | Cuauhtemoc Blanco |
4 | CHI | D | 106391.0 | 2007 | C.J. Brown |
Let's go ahead and merge the two dataframes together:
# Join dataframes together. Each unique entry defined by a player's name and the year in which he played.
df = pd.merge(left=player_stats, right=salary_df, on=["Player", "year"])
# Drop the duplicate position and club columns
df.drop(["Club", "position"], axis=1, inplace=True)
describe(df)
This dataframe has columns: Index(['Player', 'POS', 'GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 'GWA', 'FC', 'FS', 'OFF', 'YC', 'RC', 'SOG%', 'year', 'Season', 'club', 'guaranteed_compensation'], dtype='object') This data frame has 6532 rows
| | Player | POS | GP | GS | MINS | G | A | SHTS | SOG | GWG | ... | FC | FS | OFF | YC | RC | SOG% | year | Season | club | guaranteed_compensation |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Luciano Emilio | F | 29 | 28 | 2410 | 20 | 1 | 79 | 47 | 4 | ... | 35 | 39 | 21 | 2 | 0 | 59.49 | 2007 | reg | DC | 293125.0 |
1 | Luciano Emilio | F | 2 | 1 | 77 | 0 | 0 | 2 | 2 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 100.00 | 2007 | post | DC | 293125.0 |
2 | Juan Pablo Angel | F | 24 | 24 | 2125 | 19 | 5 | 97 | 53 | 5 | ... | 31 | 20 | 42 | 2 | 1 | 54.64 | 2007 | reg | NY | 1593750.0 |
3 | Juan Pablo Angel | F | 2 | 2 | 154 | 0 | 0 | 7 | 4 | 0 | ... | 5 | 3 | 2 | 0 | 0 | 57.14 | 2007 | post | NY | 1593750.0 |
4 | Taylor Twellman | F | 26 | 25 | 2283 | 16 | 3 | 90 | 55 | 5 | ... | 13 | 41 | 21 | 2 | 0 | 61.11 | 2007 | reg | NE | 350008.0 |
5 rows × 21 columns
This dataframe isn't clean yet.
Some players play multiple positions; their position is denoted with two letters. For example, a player who plays both forward and midfield has "F/M" in the "POS" column. This complicates analysis, especially since we're working with a relatively small dataframe to begin with. If we want to make meaningful connections between a player's position and his pay, we should simplify things a bit: we'll designate a player's position as whichever one is listed first.
Entries are also listed twice, once for the regular season and once for the postseason, which is confusing. We're primarily concerned with how a player performs in the regular season, so we'll make a new dataframe out of regular-season entries and analyze that.
# Regularize player positions by taking exclusively the first letter listed
df["POS"] = df["POS"].astype(str).apply(lambda position: position[0])
# Split into regular season and postseason dataframes. We take copies so that
# later in-place edits don't trip pandas' SettingWithCopyWarning.
grouped = df.groupby(df.Season)
reg = grouped.get_group("reg").copy()
post = grouped.get_group("post").copy()
describe(reg)
This dataframe has columns: Index(['Player', 'POS', 'GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 'GWA', 'FC', 'FS', 'OFF', 'YC', 'RC', 'SOG%', 'year', 'Season', 'club', 'guaranteed_compensation'], dtype='object') This data frame has 4250 rows
| | Player | POS | GP | GS | MINS | G | A | SHTS | SOG | GWG | ... | FC | FS | OFF | YC | RC | SOG% | year | Season | club | guaranteed_compensation |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Luciano Emilio | F | 29 | 28 | 2410 | 20 | 1 | 79 | 47 | 4 | ... | 35 | 39 | 21 | 2 | 0 | 59.49 | 2007 | reg | DC | 293125.0 |
2 | Juan Pablo Angel | F | 24 | 24 | 2125 | 19 | 5 | 97 | 53 | 5 | ... | 31 | 20 | 42 | 2 | 1 | 54.64 | 2007 | reg | NY | 1593750.0 |
4 | Taylor Twellman | F | 26 | 25 | 2283 | 16 | 3 | 90 | 55 | 5 | ... | 13 | 41 | 21 | 2 | 0 | 61.11 | 2007 | reg | NE | 350008.0 |
6 | Eddie Johnson | F | 24 | 24 | 2149 | 15 | 6 | 75 | 43 | 6 | ... | 21 | 57 | 50 | 6 | 0 | 57.33 | 2007 | reg | KC | 875000.0 |
8 | Maykel Galindo | F | 28 | 24 | 2021 | 12 | 5 | 55 | 28 | 5 | ... | 44 | 53 | 30 | 6 | 0 | 50.91 | 2007 | reg | CHV | 72500.0 |
5 rows × 21 columns
describe(post)
This dataframe has columns: Index(['Player', 'POS', 'GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 'GWA', 'FC', 'FS', 'OFF', 'YC', 'RC', 'SOG%', 'year', 'Season', 'club', 'guaranteed_compensation'], dtype='object') This data frame has 2282 rows
| | Player | POS | GP | GS | MINS | G | A | SHTS | SOG | GWG | ... | FC | FS | OFF | YC | RC | SOG% | year | Season | club | guaranteed_compensation |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Luciano Emilio | F | 2 | 1 | 77 | 0 | 0 | 2 | 2 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 100.00 | 2007 | post | DC | 293125.0 |
3 | Juan Pablo Angel | F | 2 | 2 | 154 | 0 | 0 | 7 | 4 | 0 | ... | 5 | 3 | 2 | 0 | 0 | 57.14 | 2007 | post | NY | 1593750.0 |
5 | Taylor Twellman | F | 4 | 4 | 360 | 3 | 0 | 20 | 10 | 2 | ... | 5 | 8 | 2 | 1 | 0 | 50.00 | 2007 | post | NE | 350008.0 |
7 | Eddie Johnson | F | 3 | 3 | 270 | 0 | 0 | 5 | 1 | 0 | ... | 4 | 10 | 5 | 1 | 0 | 20.00 | 2007 | post | KC | 875000.0 |
9 | Maykel Galindo | F | 1 | 1 | 90 | 0 | 0 | 4 | 0 | 0 | ... | 1 | 2 | 1 | 0 | 0 | 0.00 | 2007 | post | CHV | 72500.0 |
5 rows × 21 columns
Let's see what we're working with! We'll try to get some basic information on the players in our dataframe.
  * How many players of each position are represented?
  * How many players from each team? From each year?
  * What's the distribution of goals and assists across players?
  * What's the median salary for each position? The variance?
  * Who are the top-paid players, overall and for each position?
  * How are salaries distributed?
# How many players of each position are represented?
sns.countplot(x="POS", data=reg)
We have roughly the same number of midfielders and defenders, but fewer forwards. This is a natural byproduct of the fact that most soccer teams put 4 defenders and 4 midfielders on the pitch at any given time but only 2 forwards:
# How many players from each team?
sns.countplot(x="club", data=reg)
Some teams are more represented than others. This could be happening for a lot of reasons: some teams may have higher turnover than others, for example. But most variation is explained by the fact that some teams are simply older than others. The MLS has expanded a lot since its beginning. Atlanta only began playing in 2017, for example, so it makes sense that they're less represented in our dataset. Similarly, New York had one franchise until 2015, when New York City FC entered the league. The New York Red Bulls (designated as NYRB) are actually quite well represented in our dataset, as both NY and the NYRB.
Since NY and NYRB are the same franchise, we'll say that any player who played for "NY" actually played for "NYRB".
reg["club"] = reg["club"].replace("NY","NYRB")
# Same visualization but corrected
sns.countplot(x="club", data=reg)
Looks better now.
# How many players from each year?
sns.countplot(x="year", data=reg)
Later years are more likely to be represented in our dataset than earlier years. The MLS has grown rapidly since 2007 (adding 9 teams, which is 70% growth over this period) so this makes sense.
# What's the distribution of goals?
sns.displot(x="G", data=reg, kde=True, height=15, aspect=15/15)
The distribution is clearly very skewed. It may be a good idea to plot it on a log scale:
# What's the distribution of goals? but on a log scale.
sns.displot(x="G", data=reg, kde=True, height=15, aspect=15/15)
plt.xscale('log')
Still very skewed. Only a small proportion of players score over 10 goals in a season.
# As a box plot...
sns.boxplot(x="G", data=reg)
75% of players score fewer than 2 goals a season, but the 5 best score over 20.
# What's the distribution of assists? but on a log scale.
sns.displot(x="A", data=reg, kde=True, height=15, aspect=15/15)
plt.xscale('log')
The shape is nearly identical to the distribution of goals. The numbers are a bit bigger in absolute terms, probably because up to two players can be credited for an assist.
Why are both distributions so heavily skewed? Why do a few players score so much and everyone else scores so little? It's hard to say. We'll revisit, but for now we'll just make note of the fact that this distribution is highly asymmetrical.
# Boxplot for assists...
sns.boxplot(x="A", data=reg)
With more than 10 "outliers", it's probably not fair to treat those data points as aberrations. In other words, we're going to keep them in our dataset.
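To make that point concrete, here's a quick sketch that counts how many assist totals sit above the usual Tukey fence (1.5 × IQR above the third quartile):

# Count the points above the standard Tukey fence (Q3 + 1.5*IQR) for assists.
# When this many points clear the fence, "outlier" stops being a useful label.
q1, q3 = reg["A"].quantile([0.25, 0.75])
fence = q3 + 1.5 * (q3 - q1)
print((reg["A"] > fence).sum(), "players sit above the fence")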
# What's the median salary for each position? The variance?
sns.boxplot(x="POS", y="guaranteed_compensation", data=reg)
You have to feel bad for the defenders here. Only a handful of them make more than $1,000,000 a year.
This plot seems to provide some evidence for the idea that goals and assists (and other offensive stats) are the primary determinants of a player's salary, since midfielders and forwards are making way more than defenders.
It's a relief to know that defenders aren't paid based on completely different criteria than forwards and midfielders. Assuming this will simplify our analysis once we get to the machine learning part of things.
Note how wildly asymmetrical these distributions are.
# Who are the top five highest paid players for each position?
# We'll use a utility function here to reduce code reuse.
def findTopFive(pos):
    top_five = reg[reg["POS"] == pos].nlargest(5, 'guaranteed_compensation')
    return top_five
findTopFive('D')
| | Player | POS | GP | GS | MINS | G | A | SHTS | SOG | GWG | ... | FC | FS | OFF | YC | RC | SOG% | year | Season | club | guaranteed_compensation |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4669 | Omar Gonzalez | D | 30 | 29 | 2577 | 1 | 3 | 13 | 6 | 1 | ... | 34 | 16 | 0 | 3 | 0 | 46.15 | 2015 | reg | LA | 1450000.0 |
3894 | Omar Gonzalez | D | 22 | 22 | 1902 | 4 | 2 | 18 | 10 | 1 | ... | 28 | 19 | 1 | 5 | 1 | 55.56 | 2014 | reg | LA | 1250000.0 |
5453 | Liam Ridgewell | D | 22 | 22 | 1980 | 1 | 0 | 12 | 4 | 1 | ... | 25 | 30 | 2 | 6 | 0 | 33.33 | 2016 | reg | POR | 1250000.0 |
3963 | Liam Ridgewell | D | 15 | 15 | 1350 | 2 | 1 | 10 | 4 | 0 | ... | 16 | 14 | 2 | 3 | 0 | 40.00 | 2014 | reg | POR | 1200000.0 |
4876 | Liam Ridgewell | D | 32 | 32 | 2868 | 0 | 1 | 12 | 2 | 0 | ... | 19 | 25 | 2 | 6 | 1 | 16.67 | 2015 | reg | POR | 1150000.0 |
5 rows × 21 columns
findTopFive('M')
| | Player | POS | GP | GS | MINS | G | A | SHTS | SOG | GWG | ... | FC | FS | OFF | YC | RC | SOG% | year | Season | club | guaranteed_compensation |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3806 | Clint Dempsey | M | 26 | 23 | 2132 | 15 | 10 | 113 | 47 | 5 | ... | 34 | 52 | 12 | 4 | 0 | 41.59 | 2014 | reg | SEA | 6695188.75 |
242 | David Beckham | M | 5 | 2 | 252 | 0 | 2 | 8 | 0 | 0 | ... | 2 | 7 | 0 | 0 | 0 | 0.00 | 2007 | reg | LA | 6500000.04 |
511 | David Beckham | M | 25 | 25 | 2248 | 5 | 10 | 35 | 12 | 1 | ... | 14 | 37 | 4 | 6 | 0 | 34.29 | 2008 | reg | LA | 6500000.04 |
1099 | David Beckham | M | 11 | 11 | 889 | 2 | 3 | 19 | 9 | 2 | ... | 14 | 9 | 0 | 2 | 1 | 47.37 | 2009 | reg | LA | 6500000.04 |
1546 | David Beckham | M | 7 | 5 | 466 | 2 | 1 | 9 | 6 | 1 | ... | 4 | 6 | 0 | 2 | 0 | 66.67 | 2010 | reg | LA | 6500000.04 |
5 rows × 21 columns
findTopFive('F')
| | Player | POS | GP | GS | MINS | G | A | SHTS | SOG | GWG | ... | FC | FS | OFF | YC | RC | SOG% | year | Season | club | guaranteed_compensation |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4420 | Sebastian Giovinco | F | 33 | 32 | 2775 | 22 | 16 | 181 | 73 | 7 | ... | 43 | 90 | 34 | 4 | 0 | 40.33 | 2015 | reg | TOR | 7115555.67 |
5142 | Sebastian Giovinco | F | 28 | 28 | 2418 | 17 | 15 | 177 | 59 | 4 | ... | 32 | 69 | 29 | 3 | 0 | 33.33 | 2016 | reg | TOR | 7115555.67 |
5815 | Sebastian Giovinco | F | 25 | 25 | 2057 | 16 | 6 | 128 | 52 | 4 | ... | 30 | 54 | 10 | 4 | 0 | 40.62 | 2017 | reg | TOR | 7115555.67 |
3822 | Jermain Defoe | F | 19 | 17 | 1529 | 11 | 2 | 58 | 25 | 5 | ... | 22 | 23 | 25 | 5 | 0 | 43.10 | 2014 | reg | TOR | 6180000.00 |
4426 | David Villa | F | 30 | 29 | 2514 | 18 | 8 | 138 | 64 | 6 | ... | 29 | 49 | 47 | 4 | 0 | 46.38 | 2015 | reg | NYCFC | 5610000.00 |
5 rows × 21 columns
Not a whole lot of surprises here -- the highest paid players are some big American names (Clint Dempsey) and some famous players from overseas who were wooed over to play in the MLS for a little while (David Beckham, Sebastian Giovinco). It's worth noting that players may appear more than once in the top five if they have a long-term lucrative contract, as Sebastian Giovinco did.
# How are salaries distributed?
sns.displot(x="guaranteed_compensation", data=reg, kde=True, height=15, aspect=15/15)
This distribution is ridiculously asymmetrical, much more so than the distributions for goals and assists, which were highly asymmetrical themselves. Let's see how it looks on a log scale:
sns.displot(x="guaranteed_compensation", data=reg, kde=True, height=15, aspect=15/15)
plt.xscale('log')
Still ridiculously asymmetrical. We should consider what could possibly be causing these asymmetries.
Many things in nature are fit well by a normal distribution. Heights, blood pressure, measurement error, etc. This can be attributed to the central limit theorem. When independent random variables are added, their sum tends toward a normal distribution even if the original variables themselves are not normally distributed. So any number that can be viewed as the sum/average of a lot of small random variables can be fit well by a normal distribution.
Different generative mechanisms can lead to much different distributions. For example, the product / geometric average of a lot of small random variables leads to a log-normal distribution.
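A quick simulation (purely illustrative, not part of our analysis) makes the contrast vivid: sums of uniform noise come out symmetric and bell-shaped, while products of the same noise come out heavily right-skewed.

# Sums of i.i.d. noise look normal (central limit theorem); products look
# log-normal, because the log of a product is a sum of logs.
rng = np.random.default_rng(42)
noise = rng.uniform(0.5, 1.5, size=(10000, 50))
sums = noise.sum(axis=1)
products = noise.prod(axis=1)
print("sums:     mean %.2f, median %.2f" % (sums.mean(), np.median(sums)))
print("products: mean %.4f, median %.4f" % (products.mean(), np.median(products)))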
You can find out more about power laws at this link and the paper that accompanied powerlaw's release.
Power laws are heavy-tailed distributions (that is, distributions whose tails are heavier than the exponential distribution's) that occur very often in the real world. In simple terms, this means there are far more large values in the data than a normal distribution could possibly explain. Income is distributed according to a power law, so it's fair to guess that soccer players' pay is too.
Power laws have some very interesting mathematical properties. A power law's probability density takes the form $p(x) \propto x^{-\alpha}$ for $x \geq x_{min}$. If the parameter $\alpha$ in this equation is < 3, then the distribution has no finite standard deviation. If $\alpha$ is < 2, then it has no finite mean, and therefore no meaningful central tendency.
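To see what "no standard deviation" means in practice, here's a toy simulation (using NumPy's Pareto sampler, not our data) with alpha just above 2: the sample standard deviation never settles down, no matter how much data we draw.

# numpy's pareto(a) has pdf exponent a + 1, so a = alpha - 1 in our convention
alpha = 2.1
rng = np.random.default_rng(0)
for n in [10**3, 10**5, 10**7]:
    sample = rng.pareto(alpha - 1, size=n) + 1  # Pareto with p(x) ~ x**(-alpha), x >= 1
    print("n = %8d, sample std = %10.1f" % (n, sample.std()))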
Why do we care? If we know that compensation follows a power law, that may shed some light on the generative mechanisms causing such discrepancies in pay. The first link I attached states that "power laws arise from the feedback introduced by correlated decisions across a population." For example, the popularity of websites and academic papers follows a power law, since it's driven by "decision-making cascades" -- people have a tendency to copy decisions made by people who acted before them.
If a power law fits this data really well, then does that mean that a good degree of the variation in soccer player pay is caused by a similar "decision-making cascade"? What would that "decision-making cascade" be? Maybe it's the result of teams outbidding each other for top players. Maybe it's something else. It's an interesting enough question (and the powerlaw library makes it easy enough to answer) that we'll use a few lines of code to investigate.
Using the powerlaw library is very easy and you should be able to get the gist of it just by looking at the next few lines.
# Fit the data to a power law
fit = powerlaw.Fit(df.guaranteed_compensation)
Calculating best minimal value for power law fit
# Print out the optimal value of x_min after which the powerlaw relation begins...
fit.xmin
263600.0
# Print out value of the parameter alpha as described in the equation above
fit.power_law.alpha
2.0957840338865603
# What is the distance between the data and the fit?
fit.power_law.D
0.04207909594134507
The distance between the data and the fit (D is the Kolmogorov-Smirnov distance) is small, indicating that a power law might be a good fit. Let's compare it to some other distributions to really see.
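Before the formal comparisons, a visual check is worthwhile. The powerlaw package can overlay the empirical complementary CDF of the tail with the fitted power law; the styling below is just one way to do it.

# Empirical CCDF of the tail vs. the fitted power law, on log-log axes
ax = fit.plot_ccdf(linewidth=2, label="empirical tail")
fit.power_law.plot_ccdf(ax=ax, color="r", linestyle="--", label="power law fit")
ax.legend()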
The higher the first value in the tuple returned by the distribution_compare method, the more likely it is that the first distribution fits the data better than the second; large negative values indicate that the second distribution is the better fit. The second value in the tuple is the p-value -- smaller values mean the package is more confident that one distribution really is better than the other. You can find more rigorous definitions in the paper that accompanied powerlaw's release.
The power law absolutely must fit better than an exponential distribution if it has any hope of being accurate, since otherwise the tail isn't even heavy enough to count as a heavy tail:
fit.distribution_compare('power_law', 'exponential')
(307.130482101965, 5.029675800855953e-46)
fit.distribution_compare('power_law', 'stretched_exponential')
(2.0636498108777452, 0.3160046535476352)
fit.distribution_compare('truncated_power_law', 'stretched_exponential')
(6.632636956108646, 5.349697282362818e-15)
A truncated power law seems to fit the data way better than any kind of exponential fit does. That's a good sign. Now let's compare the distribution to what is usually the main competitor to a power-law distribution to describe a heavy-tailed distribution, a lognormal distribution:
fit.distribution_compare('power_law', 'lognormal_positive')
(0.49551714932442437, 0.6439282777438966)
fit.distribution_compare('power_law', 'lognormal')
(-0.15051359002397469, 0.6581249336263886)
fit.distribution_compare('power_law', 'truncated_power_law')
Assuming nested distributions
(-4.568987145230979, 0.0025035813565522647)
fit.distribution_compare('truncated_power_law', 'lognormal')
(4.418473555206981, 0.0001899817766054668)
The comparison between a pure power law and a lognormal is inconclusive, but a truncated power law fits the data significantly better than both. With an alpha value of ~2.09, we can't describe the distribution as having any meaningful standard deviation, and since alpha sits right around 2, we should be cautious even when talking about its central tendency.
This has a practical implication going forward as we begin the machine learning portion of this tutorial: be skeptical of regression analysis approaches, unless we transform the data as shown here.
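For what it's worth, the standard workaround is to regress on the log of compensation and exponentiate predictions back. Here's a minimal sketch of that idea with scikit-learn's TransformedTargetRegressor (we don't use it below; it's just to show the shape of the transform):

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

# Fit on log(salary) so the heavy tail doesn't dominate the squared-error loss,
# then invert the transform automatically at prediction time.
log_salary_model = TransformedTargetRegressor(
    regressor=LinearRegression(), func=np.log, inverse_func=np.exp
)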
We now understand a lot about the data we're working with, and we're in a good position to answer our original questions: which features are most important in determining a soccer player's salary, and can we effectively predict a soccer player's salary?
We can use feature selection techniques to determine which features most strongly predict a soccer player's salary. Inspired by this article, we'll use the following techniques to determine which features have the strongest relationship with a player's salary: univariate statistical tests (F-test and mutual information), the feature importances of a tree-based ensemble, and a correlation matrix.
Machine learning approaches work best when the training dataset includes a full range of feature combinations. That way our model will be effective at determining how each feature affects a player's compensation.
Machine Learning for Absolute Beginners by Oliver Theobald recommends that a basic machine learning model contain ten times as many data points as its total number of features. Right now our dataset is small in absolute size (at 4250 rows), but it does meet this criterion, since we have only 25 features. We'll drop any rows that interfere with our ability to fit machine learning models to our data, but as long as this holds we should be OK.
For datasets with fewer than 10,000 samples, Theobald recommends clustering and dimensionality-reduction algorithms; he recommends regression analysis for larger dataframes (10,000 <= n <= 100,000). That suits us, since as we said earlier we need to be cautious about applying regression analysis when the dependent variable (compensation) follows a power law.
We'll follow along with the documentation here.
Univariate feature selection selects the most predictive features through the use of univariate statistical tests.
Our task is fundamentally a regression task -- we seek to predict numeric scores. So, as the scikit-learn documentation suggests, we'll use F-test and mutual information methods. F-tests estimate the degree of linear dependency between two random variables, which worries me since the relationship between our features and compensation is unlikely to be linear. Mutual information methods can capture any kind of dependency, but they require a decent number of samples to be accurate, which is bad since our dataset is pretty small.
We'll use both methods and compare to see what the most predictive features are.
# We'll go ahead and drop the Season value, which is useless to us at this point
reg.drop(["Season"], axis=1, inplace=True)
describe(reg)
This dataframe has columns: Index(['Player', 'POS', 'GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 'GWA', 'FC', 'FS', 'OFF', 'YC', 'RC', 'SOG%', 'year', 'club', 'guaranteed_compensation'], dtype='object') This data frame has 4250 rows
| | Player | POS | GP | GS | MINS | G | A | SHTS | SOG | GWG | GWA | FC | FS | OFF | YC | RC | SOG% | year | club | guaranteed_compensation |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Luciano Emilio | F | 29 | 28 | 2410 | 20 | 1 | 79 | 47 | 4 | 1 | 35 | 39 | 21 | 2 | 0 | 59.49 | 2007 | DC | 293125.0 |
2 | Juan Pablo Angel | F | 24 | 24 | 2125 | 19 | 5 | 97 | 53 | 5 | 2 | 31 | 20 | 42 | 2 | 1 | 54.64 | 2007 | NYRB | 1593750.0 |
4 | Taylor Twellman | F | 26 | 25 | 2283 | 16 | 3 | 90 | 55 | 5 | 1 | 13 | 41 | 21 | 2 | 0 | 61.11 | 2007 | NE | 350008.0 |
6 | Eddie Johnson | F | 24 | 24 | 2149 | 15 | 6 | 75 | 43 | 6 | 2 | 21 | 57 | 50 | 6 | 0 | 57.33 | 2007 | KC | 875000.0 |
8 | Maykel Galindo | F | 28 | 24 | 2021 | 12 | 5 | 55 | 28 | 5 | 3 | 44 | 53 | 30 | 6 | 0 | 50.91 | 2007 | CHV | 72500.0 |
# Code to clean dataset of NaN and infinite values
def clean_dataset(df):
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return df[indices_to_keep].astype(np.float64)
# Create a version of reg that's suitable for ML algorithms
# We need to drop the name value for now
reg_ML = reg.drop(["Player"], axis=1)
# Get dummy values of categorical variables so that we can easily plug our data into SciKit learn
reg_ML = pd.get_dummies(reg_ML)
# Assign the result back -- clean_dataset returns the filtered dataframe
reg_ML = clean_dataset(reg_ML)
reg_ML = reg_ML.reset_index(drop=True)
describe(reg_ML)
This dataframe has columns: Index(['GP', 'GS', 'MINS', 'G', 'A', 'SHTS', 'SOG', 'GWG', 'GWA', 'FC', 'FS', 'OFF', 'YC', 'RC', 'SOG%', 'year', 'guaranteed_compensation', 'POS_D', 'POS_F', 'POS_M', 'club_ATL', 'club_CHI', 'club_CHV', 'club_CLB', 'club_COL', 'club_DAL', 'club_DC', 'club_HOU', 'club_KC', 'club_LA', 'club_MNUFC', 'club_MTL', 'club_NE', 'club_NYCFC', 'club_NYRB', 'club_None', 'club_ORL', 'club_PHI', 'club_POR', 'club_RSL', 'club_SEA', 'club_SJ', 'club_TFC', 'club_TOR', 'club_VAN'], dtype='object') This data frame has 3460 rows
| | GP | GS | MINS | G | A | SHTS | SOG | GWG | GWA | FC | ... | club_None | club_ORL | club_PHI | club_POR | club_RSL | club_SEA | club_SJ | club_TFC | club_TOR | club_VAN |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 29 | 28 | 2410 | 20 | 1 | 79 | 47 | 4 | 1 | 35 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 24 | 24 | 2125 | 19 | 5 | 97 | 53 | 5 | 2 | 31 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 26 | 25 | 2283 | 16 | 3 | 90 | 55 | 5 | 1 | 13 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 24 | 24 | 2149 | 15 | 6 | 75 | 43 | 6 | 2 | 21 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 28 | 24 | 2021 | 12 | 5 | 55 | 28 | 5 | 3 | 44 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 45 columns
# Utility function to put SelectKBest results in a readable format
def kBest(fit, X):
    scores = pd.DataFrame(fit.scores_)
    cols = pd.DataFrame(X.columns)
    combined = pd.concat([cols, scores], axis=1)
    combined.columns = ["Feature", "Score"]
    return combined.nlargest(10, 'Score')
# Set independent variables
X = reg_ML.loc[:, reg_ML.columns != 'guaranteed_compensation']
# Set dependent variable
y = reg_ML['guaranteed_compensation']
# Fit f_regression model
f_test_results = SelectKBest(score_func=f_regression, k=10)
fit = f_test_results.fit(X, y)
kBest(fit, X)
| | Feature | Score |
---|---|---|
3 | G | 457.691816 |
5 | SHTS | 451.616350 |
6 | SOG | 403.311224 |
4 | A | 318.907212 |
7 | GWG | 308.503087 |
8 | GWA | 180.296115 |
11 | OFF | 161.357512 |
10 | FS | 158.146575 |
1 | GS | 74.472417 |
2 | MINS | 71.896782 |
# Fit mutual info regression model
mutual_info_results = SelectKBest(score_func=mutual_info_regression, k=10)
fit = mutual_info_results.fit(X, y)
kBest(fit, X)
| | Feature | Score |
---|---|---|
15 | year | 0.411701 |
2 | MINS | 0.109341 |
10 | FS | 0.103177 |
5 | SHTS | 0.102663 |
6 | SOG | 0.096148 |
0 | GP | 0.091924 |
1 | GS | 0.086884 |
4 | A | 0.075986 |
7 | GWG | 0.066230 |
9 | FC | 0.066095 |
The F-test gave results that are intuitive and in line with our exploratory data analysis: goals are the most important determinant of a player's salary, and assists matter a lot too. One is tempted to believe that the importance of SHTS and SOG comes from the fact that players who shoot more are more likely to score, or at least to be seen as great shooters by their teammates. Game-winning goals and game-winning assists are also important determinants of a player's salary. Surprisingly, the number of offsides penalties a player incurs has a big effect on his salary. I wonder if the effect is positive or negative. If it's positive, there's a simple explanation: players who are caught offsides a lot are simply creating more offensive opportunities, some of which occasionally go wrong.
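The F-test score is sign-blind, but a one-line correlation check (a quick sketch) tells us the direction of the offsides effect:

# Positive => more offsides calls go hand in hand with higher pay
print(reg["OFF"].corr(reg["guaranteed_compensation"]))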
The results from the mutual information fit don't look as good. Our dataset is relatively small, and the method probably didn't work as well as it would have with more data.
One counterintuitive result, given by both models, is that fouls sustained has a large effect on a player's salary. I'd bet this is because players who sustain fouls are more likely to get penalty kicks and are therefore more likely to score goals.
Decision-tree-based regressors can be very effective. scikit-learn has an "extra-trees" regressor that "fits a number of randomized decision trees on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting". So a concerted effort is made to combat the main drawback of using a decision tree, overfitting.
It will be interesting to see if this tree-based regressor predicts salaries well, and what kind of hyperparameter tweaks could boost its performance. But we'll answer those questions later. For now, we simply seek to understand which features the ExtraTreesRegressor finds important. We'll play around with some values of max_depth (one of the most important hyperparameters in this model) to see how that affects things.
# Utility function to plot most important features of model
def plot_importance(model, X):
    importances = pd.Series(model.feature_importances_, index=X.columns)
    importances.nlargest(10).plot(kind='barh')
    plt.show()
model = ExtraTreesRegressor()
model.fit(X, y)
plot_importance(model, X)
# Play around with hyperparameter max_depth...
model = ExtraTreesRegressor(max_depth=50)
model.fit(X, y)
plot_importance(model, X)
# Play around with hyperparameter max_depth...
model = ExtraTreesRegressor(max_depth=25)
model.fit(X, y)
plot_importance(model, X)
# Play around with hyperparameter max_depth...
model = ExtraTreesRegressor(max_depth=38)
model.fit(X, y)
plot_importance(model, X)
The fundamental conclusions of the decision tree are the same as the F-test's from above: goals and shots taken are really important determinants of a player's salary, and so is the number of assists. Interestingly, the tree model also picks up on the importance of club. At every level of max_depth I tried, playing in a big market is a huge determinant of a player's salary: playing for Toronto, LA, or NYCFC seems to matter. I worry about the effect of outliers here: does LA pay everyone a lot, or is David Beckham just dominating the model?
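One quick way to probe that worry is to compare each club's median pay, which a single superstar can barely move, with its mean pay, which he can dominate. A sketch:

# A mean far above the median flags a few giant contracts doing the work
club_pay = reg.groupby("club")["guaranteed_compensation"].agg(["median", "mean"])
print(club_pay.sort_values("mean", ascending=False).head(10))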
In many ways, the decision tree model gives more intuitive results than the F-test did. It makes a lot of sense that the club a player plays for would have a bigger impact on his salary than the number of fouls he sustains. I'm excited to see whether the decision tree predicts a player's salary well...
sns.set(rc={'figure.figsize':(25,25)})
# There are simply too many teams to put everything on a correlation matrix and still have a useful visual...
filterOne = reg_ML.loc[:, ~reg_ML.columns.str.startswith('club')]
corrMatrix = filterOne.corr()
sns.heatmap(corrMatrix, annot=True, cmap="RdYlGn")
Perhaps we should've done this during our EDA; this is a truly fascinating graphic, and there are all kinds of insights one can glean from it.
I'm excited to see how well that ExtraTreesRegressor is really able to predict salaries.
Since the book I mentioned earlier said that clustering-style algorithms tend to work well on smaller dataframes, I'm going to compare the decision tree's performance to a k-nearest-neighbors regression (not clustering per se, but a distance-based relative) and see how it goes. I'll play around with hyperparameters for both the ExtraTreesRegressor and the kNN regressor.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
tree = ExtraTreesRegressor(random_state=0).fit(X_train, y_train)
tree.score(X_test, y_test)
0.4267227119957868
Any study that attempts to predict human behavior will usually have an R^2 value below 50%, so it's no big deal that we're at 42.67%. We'll play around with hyperparameters anyway.
tree = ExtraTreesRegressor(random_state=0, max_depth=50).fit(X_train, y_train)
tree.score(X_test, y_test)
0.4267227119957868
tree = ExtraTreesRegressor(random_state=0, max_depth=40).fit(X_train, y_train)
tree.score(X_test, y_test)
0.4267227119957868
# We get our largest R^2 here...
tree = ExtraTreesRegressor(random_state=0, max_depth=38).fit(X_train, y_train)
tree.score(X_test, y_test)
0.46150241866427605
# Does bootstrapping help?
tree = ExtraTreesRegressor(random_state=0, max_depth=38, bootstrap=True).fit(X_train, y_train)
tree.score(X_test, y_test)
# Nope
0.3629694930220273
# Playing around with the minimum number of samples needed to form a leaf...
tree = ExtraTreesRegressor(random_state=0, max_depth=38, min_samples_leaf=1).fit(X_train, y_train)
# Any number greater than 1 reduces accuracy by quite a bit, indicating that the model simply needs more data.
tree.score(X_test, y_test)
0.46150241866427605
So our model probably needs more data than it has, but it's performing decently regardless. On to the kNN algorithm:
neigh = KNeighborsRegressor()
neigh.fit(X_train, y_train)
neigh.score(X_test, y_test)
0.03471634323103823
# That's really bad, can we do better with a low number of neighbors?
neigh = KNeighborsRegressor(n_neighbors=3)
neigh.fit(X_train, y_train)
neigh.score(X_test, y_test)
-0.0023612746984396082
# Yikes, let's see what happens if we make it larger.
neigh = KNeighborsRegressor(n_neighbors=50)
neigh.fit(X_train, y_train)
neigh.score(X_test, y_test)
0.07237844764579182
# That's more like it!
neigh = KNeighborsRegressor(n_neighbors=100)
neigh.fit(X_train, y_train)
neigh.score(X_test, y_test)
# But you can only go that way for so long.
0.04788003238107508
"Oliver Theobald" was really off the mark this time. The ensemble regressor performed much better than the clustering algorithm. Perhaps kNN doesn't perform well when the dependent variable has a heavy-tail distribution, or the statistical sophistication of the ensemble method just makes it better in general and better able to handle hard cases.
Soccer player pay has a very skewed distribution. Most players do alright financially, making over \$100,000 a year, but a select few are paid huge sums of money, well over \$5,000,000 in some cases. It's this kind of money that has brought big stars like David Beckham and Wayne Rooney over from overseas to play in America for a couple of years, but you can't blame the rank-and-file for being upset that the median pay isn't higher.
One thing that became clear during our analysis of the factors that determine a soccer player's salary is the enormous importance of scoring goals. If a player wants a pay raise, the best thing he can do is probably to start taking more shots. Defenders are generally in a bad spot, since they naturally take fewer shots than forwards and midfielders, but they can still increase their pay considerably by improving their offensive output.
The analysis in this tutorial is good news for selfish players. Taking a lot of shots seems to have more of a positive effect on salary than assists do! This is really pretty remarkable. Players are told for their whole lives to be good team players, but it's the ones who don't take that advice who are most likely to get paid a lot. It's also good news for aggressive players. The number of offsides penalties a player incurs is actually positively correlated with his salary.
Another piece of insight for players unhappy with their salaries: maybe you should try to sustain more fouls! This seems to have a positive effect on salary.
I should throw in the obvious disclaimer that correlation isn't causation, so you can't really take any of this analysis and turn it into practical advice. But if I were a soccer player I'd be working hard to increase my offensive output.