Fork me on GitHub

kaggle-2-Top50 of Songs

kaggle-top50

top50 是 Kaggle 官网上的一个音乐数据集。

There are 50 songs and 13 variables to be explored

新知识

数据本身是比较完美的,没有涉及到太多的数据预处理的工作,主要是学习到了多种图形的绘制

  • 直方图
  • 直方图+折线
  • 热力图
  • 饼图
  • 等高线图

属性

image-20200116124935398

分析过程

导入库和包

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy import stats
import squarify as sq
from pandas.plotting import scatter_matrix
import seaborn as sns
import sklearn
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler, LabelEncoder # 预处理模块
from sklearn.linear_model import LinearRegression # 线性回归
from sklearn.model_selection import train_test_split,cross_val_score, KFold # 数据分离,交叉验证,K折验证
from sklearn import metrics # 矩阵模块
from sklearn.metrics import confusion_matrix, classification_report # 混淆矩阵,分类报告
%matplotlib inline


# Enable rendering of Chinese (CJK) text in matplotlib figures.
mpl.rcParams["font.family"] = "sans-serif"
mpl.rcParams["font.sans-serif"] = ["SimHei"]
# CJK fonts such as SimHei typically lack a glyph for the Unicode minus sign
# (U+2212); use the ASCII hyphen instead so negative tick labels and
# annotations are not drawn as empty boxes.
mpl.rcParams["axes.unicode_minus"] = False

数据查看

1
2
3
4
5
6
# Absolute path of the Top 50 CSV file on the author's machine.
filename = '/Users/piqianchao/data-visualization/top50.csv'
data = pd.read_csv(
    filename,
    index_col=0,            # treat the file's first column as the index, not as a feature
    engine='python',
    encoding="ISO-8859-1",  # works around the UnicodeError hit when reading this file
)
data.head()

image-20200116125324481

属性重命名rename

1
# Replace the dotted CSV headers with cleaner column names (modifies `data` in place).
data.rename(
    columns={
        'Track.Name': 'track_name',
        'Artist.Name': 'artist_name',
        'Beats.Per.Minute': 'beats_per_minute',
        'Loudness..dB..': 'Loudness(dB)',
        'Valence.': 'Valence',
        'Length.': 'Length',
        'Acousticness..': 'Acousticness',
        'Speechiness.': 'Speechiness',
    },
    inplace=True,
)

Calculating the number of songs of each genre

1
2
3
# Number of songs per genre: group the rows by 'Genre' and take each group's size.
popular_genre = data.groupby(by='Genre').size()
print(popular_genre)
# The 'Genre' column as a plain Python list (one entry per row of `data`).
genre_list = data['Genre'].tolist()

image-20200116125455420

Calculating the number of songs by each of the artists

1
2
3
# Number of songs per artist: group the rows by 'artist_name' and take each group's size.
popular_artist = data.groupby(by='artist_name').size()
print(popular_artist)
# The 'artist_name' column as a plain Python list (one entry per row of `data`).
artist_list = data['artist_name'].tolist()

查看属性的统计信息

1
2
# Show at most 3 decimal places. The fully-qualified key is used because the short
# form 'precision' is not accepted by newer pandas releases.
pd.set_option('display.precision', 3)
data.describe()  # summary statistics of the numeric columns

image-20200116125615806

Finding out the skew for each attribute

找出每个属性的偏度skew

1
2
# Skewness coefficient of every numeric attribute. numeric_only=True skips the text
# columns (track name, artist, genre), which would otherwise raise in pandas >= 2.0.
skew = data.skew(numeric_only=True)
print(skew)

image-20200116125718076

1
2
3
4
5
6
7
8
9
10
11
# Box-Cox transform 'Liveness' to reduce its skew, then compare the two histograms.
# scipy.stats.boxcox requires a 1-D array, so the column is selected as a Series
# (1-D) rather than as a one-column DataFrame (2-D).
# NOTE(review): Box-Cox also requires strictly positive data; assumes every
# Liveness value is > 0 -- confirm against the dataset.
transform = np.asarray(data['Liveness'].values)
print(type(transform))
data_transform = stats.boxcox(transform)[0]  # element 0 is the transformed data

plt.hist(data['Liveness'], bins=10)  # original data
plt.title("original data")
plt.show()

plt.hist(data_transform, bins=10)  # data after the skew correction
plt.title("skew corrected data")
plt.show()

image-20200116125914669

如何在直方图的基础上画出折线趋势

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Box-Cox transform 'Popularity', then draw histogram + KDE curve before and after.
# scipy.stats.boxcox requires a 1-D array, so the column is selected as a Series
# (1-D) rather than as a one-column DataFrame (2-D).
transform1 = np.asarray(data['Popularity'].values)
data_transform1 = stats.boxcox(transform1)[0]  # element 0 is the transformed data
# Plain histograms, drawn the same way as for 'Liveness':
# plt.hist(data['Popularity'],bins=10) #original data
# plt.show()
# plt.hist(data_transform1,bins=10) #corrected skew data
# plt.show()

# kde=True overlays the kernel density estimate; kde_kws styles that curve.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# migrating to histplot/displot once the plot styling has been re-checked.
sns.distplot(data['Popularity'], bins=10, kde=True,
             kde_kws={"color": "k", "lw": 2, "label": "KDE"}, color='blue')
plt.title("original data")
plt.show()

sns.distplot(data_transform1, bins=10, kde=True,
             kde_kws={"color": "k", "lw": 2, "label": "KDE"}, color='green')
plt.title("skew corrected data")
plt.show()

image-20200116125959735

Bar graph to see the number of songs of each genre

1
2
3
4
5
6
7
8
9
# Bar chart: number of songs in each genre.
fig, ax = plt.subplots(figsize=(30, 12))  # canvas size
length = np.arange(len(popular_genre))
plt.bar(length, popular_genre, color='g', edgecolor='black', alpha=0.7)

# Label every bar with its own genre. popular_genre's index holds the genre names in
# the same order as the bar heights; genre_list has one entry per song, so it matches
# neither the number of bars nor their order.
plt.xticks(length, popular_genre.index)
plt.title("Most popular genre", fontsize=28)
plt.xlabel("Genre", fontsize=25)
plt.ylabel("Number On Songs", fontsize=25)
plt.show()

image-20200116130100729

相关系数 correlation

如何求解相关系数

1
2
3
4
pd.set_option('display.width', 100)    # wrap printed output at 100 characters per line
pd.set_option('display.precision', 3)  # fully-qualified key; the short 'precision' is rejected by newer pandas
# Correlation methods: 'pearson' measures linear association, 'kendall' and 'spearman'
# are rank-based; Spearman is used here because it does not assume linearity or normality.
# Only numeric columns are correlated: the text columns cannot be ranked numerically and
# make DataFrame.corr raise in pandas >= 2.0.
correclation = data.select_dtypes(include='number').corr(method='spearman')
print(correclation)

image-20200116130212099

8.2 根据相关系数画出热力图

1
2
3
# Heatmap of the Spearman correlation matrix, each cell annotated with its value.
plt.figure(figsize=(10, 10))
plt.title("Correclation heatmap")
# Correlations diverge around 0, so the colormap is centred there. With center=1 and
# limits of [-1, 1], seaborn rescales the colormap around 1 and only part of its
# colour range is used.
sns.heatmap(correclation, annot=True, vmin=-1, vmax=1, cmap="GnBu_r", center=0)

image-20200116130323884

1
2
3
4
5
6
7
8
9
10
# Horizontal bar chart: number of songs by each artist.
fig, ax = plt.subplots(figsize=(12, 12))
length = np.arange(len(popular_artist))
plt.barh(length, popular_artist, color='r', edgecolor='black', alpha=0.7)
# Signature reminder: plt.barh(y, width, height=0.8, left=None, *, align='center', **kwargs)
# Label every bar with its own artist. popular_artist's index holds the artist names in
# the same order as the bar widths; artist_list has one entry per song, so it matches
# neither the number of bars nor their order.
plt.yticks(length, popular_artist.index)

plt.title("Most popular artists", fontsize=18)
plt.ylabel("Artists", fontsize=18)  # axis labels
plt.xlabel("Number of songs", fontsize=16)
plt.show()

image-20200116130406443

Analysing the relationship between energy and loudness

1
2
# Scatter plot of Loudness(dB) against Energy with a fitted regression line.
fig = plt.subplots(figsize=(10, 10))  # plt.subplots returns a (figure, axes) pair
sns.regplot(data=data, x='Energy', y='Loudness(dB)', color='black')

image-20200116130447221

Dependence between energy and popularity

1
2
3
4
# Regression line of Popularity on Energy, overlaid with bivariate KDE contour lines.
fig = plt.subplots(figsize=(10, 10))  # plt.subplots returns a (figure, axes) pair
plt.title('Dependence between energy and popularity')
sns.regplot(x='Energy', y='Popularity', ci=None, data=data)
# Pass both variables by keyword: current seaborn accepts only `data` positionally,
# so kdeplot(data.Energy, data.Popularity) no longer draws a bivariate density.
sns.kdeplot(x=data.Energy, y=data.Popularity)

image-20200116130523744

1
2
3
4
# Treemap of genres: each rectangle's area is the number of songs in that genre.
genre_counts = data.Genre.value_counts()
plt.figure(figsize=(14, 8))
# Labels come from the same Series as the sizes so each label sits on its own rectangle.
# data['Genre'].unique() is in order of first appearance while value_counts() is sorted
# by count, so pairing those two attaches labels to the wrong rectangles.
sq.plot(sizes=genre_counts, label=genre_counts.index.tolist(), alpha=0.8)
plt.axis('off')
plt.show()

image-20200116130625046

Pie charts 饼图

通过每个歌手和其歌曲数目制作饼图

1
2
3
4
5
6
7
8
# Pie chart of the share of songs contributed by each artist.
artist_counts = data.artist_name.value_counts()
labels = artist_counts.index   # wedge labels: artist names
sizes = artist_counts.values   # wedge sizes: number of songs per artist
colors = ['red', 'yellowgreen', 'lightcoral', 'lightskyblue', 'cyan', 'green', 'black', 'yellow']
autopct = "%1.1f%%"            # format of the percentage printed on each wedge
plt.figure(figsize=(10, 10))
# autopct must be passed to plt.pie; assigning the variable alone draws no percentages.
plt.pie(sizes, labels=labels, colors=colors, autopct=autopct)
plt.axis('equal')
plt.show()

image-20200116130700377

Linear Regression

数据构建和TTS

1
2
3
4
5
6
7
8
# 构建训练集和测试集
# Feature matrix (five audio attributes) and target vector (Popularity).
x = data.loc[
    :, ['Energy', 'Danceability', 'Length', 'Loudness(dB)', 'Acousticness']
].values
y = data.loc[:, 'Popularity'].values

# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Fit an ordinary linear regression on the training split.
reg = LinearRegression()
reg.fit(X_train, y_train)

预测

1
2
3
4
# 进行预测,真实值和预测值之间的比较
# Predict on the test split and tabulate actual against predicted values.
y_pred = reg.predict(X_test)
data_output = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred,
    }
)
print(data_output)

image-20200116130827916

1
2
3
4
5
6
7
8
9
10
11
# 计算LR的准确率:MAE:mean absolute error;MSE: mean sqaured error
# Error metrics of the linear regression on the test split
# (MAE: mean absolute error; MSE: mean squared error).
test_mae = metrics.mean_absolute_error(y_test, y_pred)
test_mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE", test_mae)
print("MSE", test_mse)
print("Root MSE:", np.sqrt(test_mse))

# Test values plotted against predicted values (star markers joined by a dashed line).
plt.figure(figsize=(10, 10))
plt.plot(
    y_pred,
    y_test,
    color='black',
    linestyle='dashed',
    marker='*',
    markerfacecolor='red',
    markersize=10,
)
plt.title("Error analsis")
plt.xlabel("Predicted values")
plt.ylabel("Test values")

image-20200116130909271

交叉验证

1
2
3
4
5
6
7
8
# 5-fold cross-validation of a fresh linear regression on the training split.
# The original snippet first rebuilt x / y from two features but never used them;
# those dead assignments are dropped because the scores are computed on
# X_train / y_train, the same split that produced y_pred and y_test below.
reg = LinearRegression()
mse = cross_val_score(reg, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
mean_mse = np.mean(mse)  # scores are negated MSE values, so this mean is <= 0
print(mean_mse)
# Gap between the hold-out test MSE and the magnitude of the cross-validated MSE.
diff = metrics.mean_squared_error(y_test, y_pred) - abs(mean_mse)
print(diff)

本文标题:kaggle-2-Top50 of Songs

发布时间:2020年01月16日 - 13:01

原始链接:http://www.renpeter.cn/2020/01/16/kaggle-2-top50.html

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。

Coffee or Tea