kaggle-3-Appstore

Reason

The ever-changing mobile landscape is a challenging space to navigate. The share of mobile over desktop usage is only increasing. Android holds about 53.2% of the smartphone market, while iOS holds 43%. To get more people to download your app, you need to make sure they can easily find it. Mobile app analytics is a great way to understand the existing strategy and to drive growth and retention of future users.

With millions of apps around nowadays, the following data set has become key to identifying the top trending apps in the iOS App Store. This data set contains details of more than 7000 Apple iOS mobile applications. The data was extracted from the iTunes Search API on the Apple Inc. website. R and Linux web-scraping tools were used for this study.

Inspiration

How do the app details contribute to the user ratings?
How do app statistics compare across different groups?

Explanation

appleStore.csv
"id" : App ID

"track_name": App Name

"size_bytes": Size (in Bytes)

"currency": Currency Type

"price": Price amount

"rating_count_tot": User Rating counts (for all versions)

"rating_count_ver": User Rating counts (for current version)

"user_rating" : Average User Rating value (for all versions)

"user_rating_ver": Average User Rating value (for current version)

"ver" : Latest version code

"cont_rating": Content Rating

"prime_genre": Primary Genre

"sup_devices.num": Number of supporting devices

"ipadSc_urls.num": Number of screenshots shown for display

"lang.num": Number of supported languages

"vpp_lic": Vpp Device Based Licensing Enabled

appleStore_description.csv
id : App ID
track_name: Application name
size_bytes: Memory size (in Bytes)
app_desc: Application description

import package

import pandas as pd
import numpy as np
import plotly_express as px
import plotly as py
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import random

import dash
import dash_html_components as html
import dash_core_components as dcc

Check data

description

appstore

merge data

Merge the two DataFrames mentioned above.

add new column

size_bytes_in_MB

isNotFree

There are 3141 Not-Free Apps in this dataset
There are 4056 Free Apps in this dataset

prime_genre

主要分析的是APP类别

type of app

method-1

method-2(good method)

# Random color generation: each color is "#" followed by six hex digits.
def random_color_generator(number_of_colors):
    """Return a list of ``number_of_colors`` random hex color strings.

    Each entry is a ``#`` followed by six characters, every one drawn
    independently with ``random.choice`` from ``0123456789ABCDEF``.
    """
    hex_digits = '0123456789ABCDEF'
    palette = []
    for _ in range(number_of_colors):
        digits = [random.choice(hex_digits) for _ in range(6)]
        palette.append("#" + ''.join(digits))
    return palette

# Bar chart with one bar per entry of `cnt_srs`, labelled with `text`.
# NOTE(review): `cnt_srs` and `text` are defined outside this cell; presumably
# they hold the per-genre app counts and their percentage labels -- confirm.
trace = go.Bar(
    x = cnt_srs.index,
    y = cnt_srs.values,
    text = text,
    marker = dict(
        # One random color per bar, instead of a fixed batch of 100 colors.
        color = random_color_generator(len(cnt_srs)),
        line = dict(color='rgb(8, 48, 107)',   # color and width of the bar outline
                    width = 1.5)
    ),
    opacity = 0.7   # bar transparency
)

# The figure data must be a list of traces.
data = [trace]

# Layout settings
layout = go.Layout(
    title = 'Prime genre',   # title of the whole figure
    margin = dict(
        l = 100   # left margin
    ),
    xaxis = dict(
        title = 'Type of app'   # axis titles
    ),
    yaxis = dict(
        title = 'Count of app'
    ),
    width = 900,  # figure width and height
    height = 500
)

fig = go.Figure(data=data, layout=layout)

fig.update_traces(textposition="outside")   # draw each bar's text label outside the bar

fig.show()

Free vs NotFree

data

# Split the apps by the isNotFree flag.
df_free = df.loc[df["isNotFree"] == 0]      # free
df_NotFree = df.loc[df["isNotFree"] == 1]   # notfree

# Random color generation: each color is "#" followed by six hex digits.
def random_color_generator(number_of_colors):
    """Return a list of ``number_of_colors`` random hex color strings.

    Each entry is a ``#`` followed by six characters, every one drawn
    independently with ``random.choice`` from ``0123456789ABCDEF``.
    """
    hex_digits = '0123456789ABCDEF'
    palette = []
    for _ in range(number_of_colors):
        digits = [random.choice(hex_digits) for _ in range(6)]
        palette.append("#" + ''.join(digits))
    return palette


# Build a figure with three charts: genre counts for free apps, genre counts
# for not-free apps, and both series together in one shared subplot.
fig = make_subplots(rows=2, cols=2,  # 2 x 2 grid of subplots
                    vertical_spacing=0.25,   # vertical gap between the subplot rows
                    subplot_titles=("（1）Free","（2）NotFree","（3）Group Bar of Free & notFree"),   # title of each subplot
                    specs=[[{"type": "xy"}, {"type": "xy"}],  # type of each subplot
                           [{"rowspan": 1, "colspan": 2}, None]]  # row 2: one plot spanning both columns; cell (2, 2) is empty
                   )
# single bar

# 1-free: apps per genre, labelled with each genre's share of the free apps
cnt_srs1 = df_free['prime_genre'].value_counts()
text1 = ['{:.2f}%'.format(100 * (value / cnt_srs1.sum())) for value in cnt_srs1.values]
trace1 = go.Bar(
    x = cnt_srs1.index,
    y = cnt_srs1.values,
    text = text1,
    marker = dict(
        # One random color per bar, instead of a fixed batch of 100 colors.
        color = random_color_generator(len(cnt_srs1)),
        line = dict(color='rgb(8, 48, 107)',
                    width = 1.5)
    ),
    opacity = 0.7
)


# 2-notfree: same chart for the not-free apps
cnt_srs2 = df_NotFree['prime_genre'].value_counts()
text2 = ['{:.2f}%'.format(100 * (value / cnt_srs2.sum())) for value in cnt_srs2.values]
trace2 = go.Bar(
    x = cnt_srs2.index,
    y = cnt_srs2.values,
    text = text2,
    marker = dict(
        # One random color per bar, instead of a fixed batch of 100 colors.
        color = random_color_generator(len(cnt_srs2)),
        line = dict(color='rgb(8, 48, 107)',
                    width = 1.5)
    ),
    opacity = 0.7
)


# group bar chart: both series are added to the same subplot (row 2, col 1)

trace3 = go.Bar(
    x = cnt_srs1.index,
    y = cnt_srs1.values,
    text = text1,
    opacity = 0.7,
#     name='Free'
)


trace4 = go.Bar(
    x = cnt_srs2.index,
    y = cnt_srs2.values,
    text = text2,
    opacity = 0.7,
#     name='Not-Free'
)

fig.add_trace(trace1,row=1,col=1)
fig.add_trace(trace2,row=1,col=2)
fig.add_trace(trace3,row=2,col=1)
fig.add_trace(trace4,row=2,col=1)

fig.update_layout(height=800,width=900,
                  title_text="Free vs NotFree",   # title of the whole figure
                  showlegend=False)   # hide the legend

fig.show()

User rating depending on prime_genre

# Mean user rating per primary genre, highest first.
cnt_srs = df.groupby('prime_genre')['user_rating'].mean().sort_values(ascending=False)   # descending

trace = go.Bar(
    x = cnt_srs.index,   # genres
    y = cnt_srs.values,   # mean user rating of each genre
    marker = dict(
        # One random color per bar, instead of a fixed batch of 100 colors.
        color = random_color_generator(len(cnt_srs)),
        line = dict(color='rgb(8, 48, 107)',
                    width = 1.5)
    ),
    opacity = 0.7
)
data = [trace]

layout = go.Layout(
    title = 'User rating depending on Prime genre',
    margin = dict(
        l = 100
    ),
    xaxis = dict(
        title = 'Genre'
    ),
    yaxis = dict(
        title = 'Mean User Rating'
    ),
    width = 800,
    height = 500
)

fig = go.Figure(data=data, layout=layout)
fig.show()

Rating change

Float columns

# How to make a heatmap: a small example built from literal values.
df_temp = df.drop(columns="id")   # df without its "id" column

fig = go.Figure(
    data=go.Heatmap(
        x=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
        y=['Morning', 'Afternoon', 'Evening'],
        z=[
            [1, None, 30, 50, 1],
            [20, 1, 60, 80, 30],
            [30, 60, 1, -10, 20],
        ],
        hoverongaps=False,
    )
)
fig.show()

calculate the corr

make figure

# Heatmap of df_corr, using its column labels on both axes.
# NOTE(review): df_corr is defined outside this cell; presumably it is the
# Pearson correlation matrix named in the title -- confirm.
fig = go.Figure(
    data=go.Heatmap(
        z=df_corr.values,
        x=df_corr.columns.values,
        y=df_corr.columns.values,
        colorscale='gnbu',
    )
)

fig.update_layout(
    title='Pearson Correlation of float-type features',
    width=800,
    height=700,
    margin={'l': 100},
    xaxis={'ticks': ''},
    yaxis={'ticks': ''},
)
fig.show()

# Correlation heatmap of df_temp.
# The correlation matrix is computed once and reused for z, x and y.
# Only numeric/boolean columns are kept: df_temp still contains text columns,
# and DataFrame.corr() raises on those in pandas >= 2.0 instead of skipping them.
corr_matrix = df_temp.select_dtypes(include=["number", "bool"]).corr()

data = [
    go.Heatmap(
        z = corr_matrix.values,
        x = corr_matrix.columns.values,
        y = corr_matrix.columns.values,
        colorscale='magma',   # other options: 'deep', 'delta', 'dense', 'earth', 'edge', 'electric','emrld', 'fall', 'geyser', 'gnbu', 'gray', 'greens'
#         reversescale=False,
    )
]

layout = go.Layout(
    title='Pearson Correlation of float-type features',
    xaxis = dict(ticks=''),
    yaxis = dict(ticks='' ),
    width = 800, height = 700,
    margin = dict(
        l = 100
    )
)

fig = go.Figure(data=data, layout=layout)
fig.show()