Analysis

Setup

library(tidyverse) # load tidyverse package
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly) # load other necessary packages

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(ggplot2)
library(dplyr)
library(htmlwidgets)
# Import the cleaned dataset. The analysis below reads `df_clean`, which is
# presumably one of the objects restored by this load() call — TODO confirm.
load("data/video_game_cleaned.RData")

Data Analysis

Q2. What are the top 20 games in the shooter genre?

# Rank shooter titles by their summed total sales (all rows sharing a name are
# added together) and draw the 20 best sellers as a horizontal bar chart.
top_shooter <- df_clean |>
  filter(genre == "Shooter") |> # keep only the shooter genre
  group_by(name) |>
  # na.rm = TRUE matches the regional sums used for Q3: a single missing value
  # must not turn a game's whole total into NA and drop it from the ranking
  summarise(shooter_rank = sum(total_sales, na.rm = TRUE)) |>
  arrange(desc(shooter_rank)) |>
  head(20) |> # only look at the top 20 games
  ggplot(aes(
    x = reorder(name, shooter_rank), # order bars by sales, not alphabetically
    y = shooter_rank,
    fill = shooter_rank
  )) +
  geom_col() +
  labs(
    title = "Top 20 Games in the Shooter Genre",
    x = "Game",
    y = "Global sales of copies in millions",
    caption = "Source: Kaggle | Author: Shen Chon Wun"
  ) +
  theme_light() +
  coord_flip() + # horizontal bars so the game names stay readable
  scale_fill_gradient(
    low = "#F4F8D3",
    high = "#73C7C7"
  )

# render the static chart as an interactive plotly widget
ggplotly(top_shooter)

Q3. Are certain genres dominant in specific regions (e.g., RPGs in Japan, shooters in North America)?

# total sales per genre, one column per region (missing values ignored)
genre_sales <- df_clean |>
  group_by(genre) |>
  summarise(
    across(
      c(na_sales, jp_sales, ea_sales, other_sales),
      \(region_sales) sum(region_sales, na.rm = TRUE)
    )
  )

# reshape wide -> long (only needed for this question): every column except
# `genre` is a regional sales column, so each one becomes a (region, sales) row
genre_sales_long <- pivot_longer(
  genre_sales,
  cols = !genre,
  names_to = "region", # former column name, e.g. "na_sales"
  values_to = "sales" # the matching sales figure
)

# share of each region's sales that every genre accounts for
genre_sales_percent <- mutate(
  genre_sales_long,
  total_sales = sum(sales),
  percentage = sales / total_sales * 100,
  .by = region # totals are computed per region; the result is left ungrouped
)

# replace the raw sales-column names with readable region labels for the chart
genre_sales_percent <- genre_sales_percent |>
  mutate(region = recode(region,
                         na_sales = "North America",
                         jp_sales = "Japan",
                         # NOTE(review): assumes `ea` stands for Europe & Africa — confirm
                         ea_sales = "Europe & Africa",
                         other_sales = "Other"))



# Stacked bar chart: one bar per region, filled by each genre's percentage.
percentage_chart <- ggplot(
  data = genre_sales_percent,
  mapping = aes(x = region, y = percentage, fill = genre)
) +
  geom_bar(stat = "identity", width = 0.6) + # bar heights come straight from `percentage`
  theme_classic() +
  theme(axis.text.x = element_text(face = "bold", size = 12)) +
  labs(
    title = "Percentage of Game Sales by Genre in Each Region",
    x = "Region",
    y = "Percentage of Total Sales",
    fill = "Genre"
  )

# interactive version of the chart
ggplotly(percentage_chart)

Q4. To what extent are the critic ratings and total sales of games correlated?

# Scatter plot of critic score against total sales with one linear trend line.
correlation_chart <- ggplot(
  data = df_clean,
  mapping = aes(
    x = critic_score,
    y = total_sales,
    # a constant group keeps every point in one group, so only a single
    # trend line is fitted and drawn
    group = 1,
    # extra details shown in the interactive hover tooltip
    text = paste(
      "game:", name,
      "<br>genre:", genre,
      "<br>platform:", platform
    )
  )
) +
  geom_point(colour = "skyblue", size = 0.5) +
  geom_smooth(method = "lm", se = TRUE, colour = "darkgrey") + # linear fit with confidence band
  theme_minimal() +
  labs(
    title = "Correlation between Critic Rate and Sales of the games",
    x = "Critic Rate",
    y = "Total Sales of copies in millions"
  )

# interactive version of the chart
ggplotly(correlation_chart)
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: text.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?

Chart Saving

# Make sure the output directory exists before writing: ggsave() does not
# create a missing directory by default. This is a no-op when "out" is
# already there.
dir.create("out", showWarnings = FALSE, recursive = TRUE)

# NOTE(review): `genres_by_sales` is not created by the code visible in this
# document — presumably it comes from the loaded .RData file or a hidden
# chunk; confirm.
ggsave("out/genres_ranking.png", genres_by_sales, width = 8, height = 6, dpi = 300)
ggsave("out/shooter_top20.png", top_shooter, width = 8, height = 6, dpi = 300)

Save interactive plot as HTML

# convert the static charts into plotly widgets so they can be exported as HTML
p3 <- percentage_chart |> ggplotly()
p4 <- correlation_chart |> ggplotly()
`geom_smooth()` using formula = 'y ~ x'
Warning: The following aesthetics were dropped during statistical transformation: text.
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?
# Write each interactive plot to an HTML file. The output directory is created
# first (a no-op when it already exists) so a fresh checkout without "out/"
# does not fail here.
dir.create("out", showWarnings = FALSE, recursive = TRUE)
saveWidget(p3, file = "out/p3.html")
saveWidget(p4, file = "out/p4.html")