My DataCamp Notes

CLEANING DATA IN R

library(dslabs)
suppressMessages(library(dplyr))
library(stringr) #necessary packages

data(murders) #get data from dslabs
head(murders) # shows the first 6 rows of the data set
       state abb region population total
1    Alabama  AL  South    4779736   135
2     Alaska  AK   West     710231    19
3    Arizona  AZ   West    6392017   232
4   Arkansas  AR  South    2915918    93
5 California  CA   West   37253956  1257
6   Colorado  CO   West    5029196    65
glimpse(murders) # compact overview of the data set: row/column counts, then each column with its type and first values
Rows: 51
Columns: 5
$ state      <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "…
$ abb        <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL",…
$ region     <fct> South, West, West, South, West, West, Northeast, South, Sou…
$ population <dbl> 4779736, 710231, 6392017, 2915918, 37253956, 5029196, 35740…
$ total      <dbl> 135, 19, 232, 93, 1257, 65, 97, 38, 99, 669, 376, 7, 12, 36…
str(murders) #structure of the data set
'data.frame':   51 obs. of  5 variables:
 $ state     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ abb       : chr  "AL" "AK" "AZ" "AR" ...
 $ region    : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
 $ population: num  4779736 710231 6392017 2915918 37253956 ...
 $ total     : num  135 19 232 93 1257 ...
is.numeric(murders$population) # checks whether the column is numeric. If we used assert_is_numeric(murders$state) instead, the output would be an error: "murders$state is not class 'numeric'; it has class chr." (The assert_is_* functions come from the assertive package, which must be installed and loaded first.)
[1] TRUE
revenue_trimmed = str_remove(murders$abb, "A") # str_remove() removes the first match of a pattern from each string (use str_remove_all() to remove every match)
revenue_trimmed
 [1] "L"  "K"  "Z"  "R"  "C"  "CO" "CT" "DE" "DC" "FL" "G"  "HI" "ID" "IL" "IN"
[16] "I"  "KS" "KY" "L"  "ME" "MD" "M"  "MI" "MN" "MS" "MO" "MT" "NE" "NV" "NH"
[31] "NJ" "NM" "NY" "NC" "ND" "OH" "OK" "OR" "P"  "RI" "SC" "SD" "TN" "TX" "UT"
[46] "VT" "V"  "W"  "WV" "WI" "WY"
head(murders %>% mutate(new_column = revenue_trimmed)) #add a new column to a data frame
       state abb region population total new_column
1    Alabama  AL  South    4779736   135          L
2     Alaska  AK   West     710231    19          K
3    Arizona  AZ   West    6392017   232          Z
4   Arkansas  AR  South    2915918    93          R
5 California  CA   West   37253956  1257          C
6   Colorado  CO   West    5029196    65         CO
summary(murders$population) # calculates the minimum, 1st quartile, median, mean, 3rd quartile and maximum of the variable
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
  563626  1696962  4339367  6075769  6636084 37253956 
Back to top