Fork me on GitHub

kaggle-3-Appstore

Reason

The ever-changing mobile landscape is a challenging space to navigate. The percentage of mobile over desktop is only increasing. Android holds about 53.2% of the smartphone market, while iOS is 43%. To get more people to download your app, you need to make sure they can easily find your app. Mobile app analytics is a great way to understand the existing strategy to drive growth and retention of future users.

With millions of apps around nowadays, the following data set has become key to finding the top trending apps in the iOS App Store. This data set contains more than 7000 Apple iOS mobile application details. The data was extracted from the iTunes Search API at the Apple Inc website. R and Linux web scraping tools were used for this study.

Inspiration

  • How do the app details contribute to the user ratings?
  • Try to compare app statistics for different groups?

Explanation

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
appleStore.csv
"id" : App ID

"track_name": App Name

"size_bytes": Size (in Bytes)

"currency": Currency Type

"price": Price amount

"rating_count_tot": User Rating counts (for all version)

"rating_count_ver": User Rating counts (for current version)

"user_rating" : Average User Rating value (for all version)

"user_rating_ver": Average User Rating value (for current version)

"ver" : Latest version code

"cont_rating": Content Rating

"prime_genre": Primary Genre

"sup_devices.num": Number of supporting devices

"ipadSc_urls.num": Number of screenshots shown for display

"lang.num": Number of supported languages

"vpp_lic": Vpp Device Based Licensing Enabled

appleStore_description.csv
id : App ID
track_name: Application name
size_bytes: Memory size (in Bytes)
app_desc: Application description

import package

1
2
3
4
5
6
7
8
9
10
11
import pandas as pd
import numpy as np
import plotly_express as px
import plotly as py
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import random

import dash
import dash_html_components as html
import dash_core_components as dcc

Check data

description

appstore

merge data

Merge two Dataframe mentioned above.

add new column

size_bytes_in_MB

isNotFree

  • There are 3141 Not-Free Apps in this dataset
  • There are 4056 Free Apps in this dataset

prime_genre

主要分析的是APP类别

type of app

method-1

method-2(good method)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# Random color generation: each color is "#" followed by six hex digits (e.g. #123456).
def random_color_generator(number_of_colors: int):
    """Return a list of random hex color strings.

    Args:
        number_of_colors: How many colors to generate.

    Returns:
        A list with ``number_of_colors`` entries, each of the form
        ``#XXXXXX`` where every ``X`` is an uppercase hexadecimal digit.
        The list is empty when ``number_of_colors`` is 0.
    """
    hex_digits = '0123456789ABCDEF'
    # One random.choice call per digit, six digits per color.
    return [
        "#" + ''.join(random.choice(hex_digits) for _ in range(6))
        for _ in range(number_of_colors)
    ]

# Bar chart of the app counts per genre; `text` supplies the per-bar labels.
trace = go.Bar(
    x=cnt_srs.index,
    y=cnt_srs.values,
    text=text,
    marker={
        "color": random_color_generator(100),
        # Outline color and width of each bar.
        "line": {"color": 'rgb(8, 48, 107)', "width": 1.5},
    },
    opacity=0.7,  # Bar transparency.
)

# Data section: it must be given as a list of traces.
data = [trace]

# Layout settings.
layout = go.Layout(
    title='Prime genre',  # Title of the whole figure.
    margin={"l": 100},  # Left margin.
    xaxis={"title": 'Type of app'},  # Axis titles.
    yaxis={"title": 'Count of app'},
    width=900,  # Figure width and height.
    height=500,
)

fig = go.Figure(data=data, layout=layout)

# Draw each bar's text label outside (above) the bar.
fig.update_traces(textposition="outside")

fig.show()

Free vs NotFree

data
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Split the apps on the isNotFree flag: 0 -> free, 1 -> not free.
df_free = df.loc[df["isNotFree"].eq(0)]
df_NotFree = df.loc[df["isNotFree"].eq(1)]

# Random color generation: each color is "#" followed by six hex digits (e.g. #123456).
def random_color_generator(number_of_colors: int):
    """Return a list of random hex color strings.

    Args:
        number_of_colors: How many colors to generate.

    Returns:
        A list with ``number_of_colors`` entries, each of the form
        ``#XXXXXX`` where every ``X`` is an uppercase hexadecimal digit.
        The list is empty when ``number_of_colors`` is 0.
    """
    hex_digits = '0123456789ABCDEF'
    # One random.choice call per digit, six digits per color.
    return [
        "#" + ''.join(random.choice(hex_digits) for _ in range(6))
        for _ in range(number_of_colors)
    ]


# Build a 2x2 subplot grid: two single bar charts on top, one grouped bar
# chart spanning the whole second row.
fig = make_subplots(
    rows=2,
    cols=2,  # 2x2 grid of subplots
    vertical_spacing=0.25,  # vertical gap between the subplot rows
    # Title of each subplot.
    subplot_titles=("(1)Free", "(2)NotFree", "(3)Group Bar of Free & notFree"),
    specs=[
        # Type of each subplot in row 1.
        [{"type": "xy"}, {"type": "xy"}],
        # In row 2 only the first cell holds a plot; (2, 2) is empty.
        [{"rowspan": 1, "colspan": 2}, None],
    ],
)

# --- single bar charts ---

# 1 - free: genre counts and each genre's share as a percentage label.
cnt_srs1 = df_free['prime_genre'].value_counts()
free_total = cnt_srs1.sum()
text1 = [f'{100 * (value / free_total):.2f}%' for value in cnt_srs1.values]
trace1 = go.Bar(
    x=cnt_srs1.index,
    y=cnt_srs1.values,
    text=text1,
    marker={
        "color": random_color_generator(100),
        "line": {"color": 'rgb(8, 48, 107)', "width": 1.5},
    },
    opacity=0.7,
)

# 2 - not free: same computation on the not-free subset.
cnt_srs2 = df_NotFree['prime_genre'].value_counts()
not_free_total = cnt_srs2.sum()
text2 = [f'{100 * (value / not_free_total):.2f}%' for value in cnt_srs2.values]
trace2 = go.Bar(
    x=cnt_srs2.index,
    y=cnt_srs2.values,
    text=text2,
    marker={
        "color": random_color_generator(100),
        "line": {"color": 'rgb(8, 48, 107)', "width": 1.5},
    },
    opacity=0.7,
)

# --- grouped bar chart: both series share the bottom subplot ---

trace3 = go.Bar(
    x=cnt_srs1.index,
    y=cnt_srs1.values,
    text=text1,
    opacity=0.7,
    # name='Free'
)

trace4 = go.Bar(
    x=cnt_srs2.index,
    y=cnt_srs2.values,
    text=text2,
    opacity=0.7,
    # name='Not-Free'
)

# Place every trace in its grid cell; traces 3 and 4 go into the same cell.
fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)
fig.add_trace(trace3, row=2, col=1)
fig.add_trace(trace4, row=2, col=1)

fig.update_layout(
    height=800,
    width=900,
    title_text="Free vs NotFree",  # title of the whole figure
    showlegend=False,  # hide the legend in the upper-right corner
)

fig.show()

User rating depending on prime_genre

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Mean user rating per primary genre, highest first.
cnt_srs = (
    df.groupby('prime_genre')['user_rating']
    .mean()
    .sort_values(ascending=False)  # descending
)

trace = go.Bar(
    x=cnt_srs.index,  # genres on the x axis
    y=cnt_srs.values,  # mean ratings on the y axis
    marker={
        "color": random_color_generator(100),  # bar fill colors
        "line": {"color": 'rgb(8, 48, 107)', "width": 1.5},
    },
    opacity=0.7,
)
data = [trace]

layout = go.Layout(
    title='User rating depending on Prime genre',
    margin={"l": 100},
    xaxis={"title": 'Genre'},
    yaxis={"title": 'Mean User Rating'},
    width=800,
    height=500,
)

fig = go.Figure(data=data, layout=layout)
fig.show()

image-20200708183254436

Rating change

Float columns

1
2
3
4
5
6
7
8
9
# How to make a heatmap: drop the "id" column for later use, then draw a
# small example heatmap from literal data.
df_temp = df.drop(columns="id")

fig = go.Figure(
    data=go.Heatmap(
        z=[
            [1, None, 30, 50, 1],
            [20, 1, 60, 80, 30],
            [30, 60, 1, -10, 20],
        ],
        x=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
        y=['Morning', 'Afternoon', 'Evening'],
        hoverongaps=False,
    )
)
fig.show()

calculate the corr

make figure

1
2
3
4
5
6
7
8
9
10
11
12
13
14
# Heatmap of the correlation matrix in df_corr; the column labels are used
# on both axes.
corr_heatmap = go.Heatmap(
    z=df_corr.values,
    x=df_corr.columns.values,
    y=df_corr.columns.values,
    colorscale='gnbu',
)
fig = go.Figure(data=corr_heatmap)

fig.update_layout(
    width=800,
    height=700,
    title='Pearson Correlation of float-type features',
    xaxis={"ticks": ''},
    yaxis={"ticks": ''},
    margin={"l": 100},
)
fig.show()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Correlation heatmap built directly from df_temp.
#
# The correlation matrix is computed once and reused for z and both axes.
# Only numeric/bool columns are correlated: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead.
# NOTE(review): assumes df_temp still carries text columns such as
# track_name / currency — confirm against the merge step.
df_temp_corr = df_temp.select_dtypes(include=["number", "bool"]).corr()
corr_labels = df_temp_corr.columns.values

data = [
    go.Heatmap(
        z=df_temp_corr.values,
        x=corr_labels,
        y=corr_labels,
        # Other colorscales: 'deep', 'delta', 'dense', 'earth', 'edge',
        # 'electric', 'emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens'
        colorscale='magma',
    )
]

layout = go.Layout(
    title='Pearson Correlation of float-type features',
    xaxis=dict(ticks=''),
    yaxis=dict(ticks=''),
    width=800,
    height=700,
    margin=dict(l=100),
)

fig = go.Figure(data=data, layout=layout)
fig.show()

本文标题:kaggle-3-Appstore

发布时间:2020年07月08日 - 18:07

原始链接:http://www.renpeter.cn/2020/07/08/kaggle-3-Appstore.html

许可协议: 署名-非商业性使用-禁止演绎 4.0 国际 转载请保留原文链接及作者。

Coffee or Tea