Data Cleaning

Setup

Download the required packages for data cleaning

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)

Import the original dataset and name it

df<-read_csv("data/video_game.csv")
Rows: 64016 Columns: 14
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): img, title, console, genre, publisher, developer
dbl  (6): critic_score, total_sales, na_sales, jp_sales, pal_sales, other_sales
date (2): release_date, last_update

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data Exploration

Check the structure of the dataset

str(df)
spc_tbl_ [64,016 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ img         : chr [1:64016] "/games/boxart/full_6510540AmericaFrontccc.jpg" "/games/boxart/full_5563178AmericaFrontccc.jpg" "/games/boxart/827563ccc.jpg" "/games/boxart/full_9218923AmericaFrontccc.jpg" ...
 $ title       : chr [1:64016] "Grand Theft Auto V" "Grand Theft Auto V" "Grand Theft Auto: Vice City" "Grand Theft Auto V" ...
 $ console     : chr [1:64016] "PS3" "PS4" "PS2" "X360" ...
 $ genre       : chr [1:64016] "Action" "Action" "Action" "Action" ...
 $ publisher   : chr [1:64016] "Rockstar Games" "Rockstar Games" "Rockstar Games" "Rockstar Games" ...
 $ developer   : chr [1:64016] "Rockstar North" "Rockstar North" "Rockstar North" "Rockstar North" ...
 $ critic_score: num [1:64016] 9.4 9.7 9.6 NA 8.1 8.7 8.8 9.8 8.4 8 ...
 $ total_sales : num [1:64016] 20.3 19.4 16.1 15.9 15.1 ...
 $ na_sales    : num [1:64016] 6.37 6.06 8.41 9.06 6.18 9.07 9.76 5.26 8.27 4.99 ...
 $ jp_sales    : num [1:64016] 0.99 0.6 0.47 0.06 0.41 0.13 0.11 0.21 0.07 0.65 ...
 $ pal_sales   : num [1:64016] 9.85 9.71 5.49 5.33 6.05 4.29 3.73 6.21 4.32 5.88 ...
 $ other_sales : num [1:64016] 3.12 3.02 1.78 1.42 2.44 1.33 1.14 2.26 1.2 2.28 ...
 $ release_date: Date[1:64016], format: "2013-09-17" "2014-11-18" ...
 $ last_update : Date[1:64016], format: NA "2018-01-03" ...
 - attr(*, "spec")=
  .. cols(
  ..   img = col_character(),
  ..   title = col_character(),
  ..   console = col_character(),
  ..   genre = col_character(),
  ..   publisher = col_character(),
  ..   developer = col_character(),
  ..   critic_score = col_double(),
  ..   total_sales = col_double(),
  ..   na_sales = col_double(),
  ..   jp_sales = col_double(),
  ..   pal_sales = col_double(),
  ..   other_sales = col_double(),
  ..   release_date = col_date(format = ""),
  ..   last_update = col_date(format = "")
  .. )
 - attr(*, "problems")=<externalptr> 

Check the first few rows of the dataset

head(df)
# A tibble: 6 × 14
  img           title console genre publisher developer critic_score total_sales
  <chr>         <chr> <chr>   <chr> <chr>     <chr>            <dbl>       <dbl>
1 /games/boxar… Gran… PS3     Acti… Rockstar… Rockstar…          9.4        20.3
2 /games/boxar… Gran… PS4     Acti… Rockstar… Rockstar…          9.7        19.4
3 /games/boxar… Gran… PS2     Acti… Rockstar… Rockstar…          9.6        16.2
4 /games/boxar… Gran… X360    Acti… Rockstar… Rockstar…         NA          15.9
5 /games/boxar… Call… PS4     Shoo… Activisi… Treyarch           8.1        15.1
6 /games/boxar… Call… X360    Shoo… Activisi… Infinity…          8.7        14.8
# ℹ 6 more variables: na_sales <dbl>, jp_sales <dbl>, pal_sales <dbl>,
#   other_sales <dbl>, release_date <date>, last_update <date>

Summary of the dataset and check whether there are N/A values

summary(df)
     img               title             console             genre          
 Length:64016       Length:64016       Length:64016       Length:64016      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
  publisher          developer          critic_score    total_sales   
 Length:64016       Length:64016       Min.   : 1.00   Min.   : 0.00  
 Class :character   Class :character   1st Qu.: 6.40   1st Qu.: 0.03  
 Mode  :character   Mode  :character   Median : 7.50   Median : 0.12  
                                       Mean   : 7.22   Mean   : 0.35  
                                       3rd Qu.: 8.30   3rd Qu.: 0.34  
                                       Max.   :10.00   Max.   :20.32  
                                       NA's   :57338   NA's   :45094  
    na_sales        jp_sales       pal_sales      other_sales   
 Min.   :0.00    Min.   :0.00    Min.   :0.00    Min.   :0.00   
 1st Qu.:0.05    1st Qu.:0.02    1st Qu.:0.01    1st Qu.:0.00   
 Median :0.12    Median :0.04    Median :0.04    Median :0.01   
 Mean   :0.26    Mean   :0.10    Mean   :0.15    Mean   :0.04   
 3rd Qu.:0.28    3rd Qu.:0.12    3rd Qu.:0.14    3rd Qu.:0.03   
 Max.   :9.76    Max.   :2.13    Max.   :9.85    Max.   :3.12   
 NA's   :51379   NA's   :57290   NA's   :51192   NA's   :48888  
  release_date         last_update        
 Min.   :1971-12-03   Min.   :2017-11-28  
 1st Qu.:2001-03-28   1st Qu.:2018-08-08  
 Median :2008-09-16   Median :2019-04-21  
 Mean   :2006-11-14   Mean   :2020-01-11  
 3rd Qu.:2012-12-27   3rd Qu.:2021-03-30  
 Max.   :2024-12-31   Max.   :2024-01-28  
 NA's   :7051         NA's   :46137       

Data Cleaning

Rename the uncleared column name, filter the N/A values, and select the necessary one

df_clean<- df|>
  rename(name = title, # rename the uncleared column name
         platform = console, 
         ea_sales = pal_sales)|>
  select(name, platform, genre, total_sales, na_sales, jp_sales, ea_sales, other_sales,critic_score) |> # only select the columns that are going to be use for analysis
  filter(
    !is.na(name), # filter the N/As
    !is.na(platform),
    !is.na(genre),
    !is.na(total_sales),
    !is.na(na_sales),
    !is.na(jp_sales),
    !is.na(ea_sales),
    !is.na(other_sales),
    !is.na(critic_score))

df_clean
# A tibble: 1,210 × 9
   name        platform genre total_sales na_sales jp_sales ea_sales other_sales
   <chr>       <chr>    <chr>       <dbl>    <dbl>    <dbl>    <dbl>       <dbl>
 1 Grand Thef… PS3      Acti…        20.3     6.37     0.99     9.85        3.12
 2 Grand Thef… PS4      Acti…        19.4     6.06     0.6      9.71        3.02
 3 Grand Thef… PS2      Acti…        16.2     8.41     0.47     5.49        1.78
 4 Call of Du… PS4      Shoo…        15.1     6.18     0.41     6.05        2.44
 5 Call of Du… X360     Shoo…        14.8     9.07     0.13     4.29        1.33
 6 Call of Du… X360     Shoo…        14.7     9.76     0.11     3.73        1.14
 7 Red Dead R… PS4      Acti…        13.9     5.26     0.21     6.21        2.26
 8 Call of Du… X360     Shoo…        13.9     8.27     0.07     4.32        1.2 
 9 Call of Du… PS3      Shoo…        13.8     4.99     0.65     5.88        2.28
10 Call of Du… X360     Shoo…        13.5     8.54     0.08     3.63        1.28
# ℹ 1,200 more rows
# ℹ 1 more variable: critic_score <dbl>

Export of cleaned dataset

Save the cleaned dataset in .RData format for analytical use

save(df_clean, file = "data/video_game_cleaned.RData")