# Import the cleaned dataset.
# load() restores objects under whatever names they were saved with and does
# not report which ones, so fail early if the data frame used by every
# question below (`df_clean`) is not available afterwards.
load("data/video_game_cleaned.RData")
stopifnot(
  "`df_clean` is not available after loading data/video_game_cleaned.RData" =
    exists("df_clean")
)
Data Analysis
Q1. What are the most popular genres of games in 2024 (by total sales)?
# Q1: total sales per genre, shown as a horizontal bar chart.
genres_by_sales <- df_clean |>
  group_by(genre) |>
  # Add up total_sales within each genre, ignoring missing values.
  summarise(sales_rank = sum(total_sales, na.rm = TRUE)) |>
  arrange(desc(sales_rank)) |>
  ggplot(
    mapping = aes(
      # reorder() sorts the genre axis by the summed sales.
      x = reorder(genre, sales_rank),
      y = sales_rank,
      fill = sales_rank
    )
  ) +
  geom_col() +
  # Title, axis labels and caption.
  labs(
    title = "Popularity of Genres 2024",
    x = "Genres",
    y = "Global sales of copies in millions",
    caption = "Source: Kaggle | Author: Shen Chon Wun"
  ) +
  theme_light() +
  # Flip the axes so the genre names are listed along the vertical axis.
  coord_flip() +
  # Colour gradient for the bar fill, from the low to the high sales value.
  scale_fill_gradient(low = "#F7CFD6", high = "#73C7C7")

# Render the chart as an interactive plotly widget.
ggplotly(genres_by_sales)
Q2. What are the top 20 games in the shooter genre?
# Q2: the 20 best-selling games within the "Shooter" genre.
top_shooter <- df_clean |>
  # Restrict the calculation to the shooter genre only.
  filter(genre == "Shooter") |>
  group_by(name) |>
  # Sum the sales of all rows that share a game name. na.rm = TRUE matches
  # the other summaries in this analysis: without it a single missing
  # total_sales value would turn the whole game's total into NA, and the
  # game would then sort last and drop out of the top 20.
  summarise(shooter_rank = sum(total_sales, na.rm = TRUE)) |>
  arrange(desc(shooter_rank)) |>
  # Keep only the 20 highest totals.
  head(20) |>
  ggplot(
    aes(
      # reorder() sorts the game axis by the summed sales.
      x = reorder(name, shooter_rank),
      y = shooter_rank,
      fill = shooter_rank
    )
  ) +
  geom_col() +
  labs(
    title = "Top 20 Game in Shooter Genre",
    x = "Game",
    y = "Global sales of copies in millions",
    caption = "Source: Kaggle | Author: Shen Chon Wun"
  ) +
  theme_light() +
  # Flip the axes so the game names are listed along the vertical axis.
  coord_flip() +
  scale_fill_gradient(low = "#F4F8D3", high = "#73C7C7")

# Render the chart as an interactive plotly widget.
ggplotly(top_shooter)
Q3. Are certain genres dominant in specific regions (e.g., RPGs in Japan, shooters in North America)?
# Q3: share of each genre within every sales region.

# Total sales per genre for each of the four regional sales columns.
genre_sales <- df_clean |>
  group_by(genre) |>
  summarise(
    across(
      c(na_sales, jp_sales, ea_sales, other_sales),
      \(sales_col) sum(sales_col, na.rm = TRUE)
    )
  )

# Reshape from wide to long (only needed for this question): one row per
# genre/region pair, with the former column name stored in `region` and the
# corresponding value in `sales`.
genre_sales_long <- genre_sales |>
  pivot_longer(
    cols = c(na_sales, jp_sales, ea_sales, other_sales),
    names_to = "region",
    values_to = "sales"
  )

# Percentage of each region's sales that every genre accounts for, followed
# by replacing the column-style region codes with display names.
genre_sales_percent <- genre_sales_long |>
  group_by(region) |>
  mutate(
    total_sales = sum(sales),
    percentage = sales / total_sales * 100
  ) |>
  ungroup() |>
  mutate(
    region = recode(
      region,
      na_sales = "North America",
      jp_sales = "Japan",
      ea_sales = "Europe & African",
      other_sales = "Other"
    )
  )

# Stacked bar chart: one bar per region, filled by genre.
percentage_chart <- ggplot(
  genre_sales_percent,
  aes(x = region, y = percentage, fill = genre)
) +
  geom_bar(stat = "identity", width = 0.6) +
  labs(
    title = "Percentage of Game Sales by Genre in Each Region",
    x = "Region",
    y = "Percentage of Total Sales",
    fill = "Genre"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(size = 12, face = "bold"))

# Render the chart as an interactive plotly widget.
ggplotly(percentage_chart)
Q4. To what extent are the critic score and the total sales of games correlated?
# Q4: scatter plot of critic score against total sales with a linear trend.
correlation_chart <- df_clean |>
  ggplot(
    aes(
      x = critic_score,
      y = total_sales,
      # group = 1 treats all points as a single group so that only one
      # trend line is drawn.
      group = 1,
      # Extra details shown in the interactive tooltip.
      text = paste(
        "game:", name,
        "<br>genre:", genre,
        "<br>platform:", platform
      )
    )
  ) +
  geom_point(size = 0.5, color = "skyblue") +
  # Linear trend line. The per-point `text` aesthetic is not meaningful for
  # a fitted line; leaving it inherited triggers the warning "The following
  # aesthetics were dropped during statistical transformation: text" every
  # time the plot is built, so this layer opts out with text = NULL.
  # The formula is spelled out to avoid the
  # "`geom_smooth()` using formula = 'y ~ x'" message.
  geom_smooth(
    aes(text = NULL),
    method = "lm",
    formula = y ~ x,
    se = TRUE,
    color = "darkgrey"
  ) +
  labs(
    title = "Correlation between Critic Rate and Sales of the games",
    x = "Critic Rate",
    y = "Total Sales of copies in millions"
  ) +
  theme_minimal()

# Render the chart as an interactive plotly widget.
ggplotly(correlation_chart)
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: text.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
# Convert the Q3 and Q4 charts to interactive plotly widgets and keep them
# in variables so they can be written out as HTML.
p3 <- ggplotly(percentage_chart)
p4 <- ggplotly(correlation_chart)
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: text.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
# Save the interactive plots as HTML files.
# Make sure the output folder exists before writing into it; dir.create()
# is a no-op (and silent) when the folder is already there.
dir.create("out", showWarnings = FALSE, recursive = TRUE)
saveWidget(p3, file = "out/p3.html")
saveWidget(p4, file = "out/p4.html")