数据清洗占了数据分析的80%的时间
本篇主要介绍tidyverse包中常用于数据清洗的函数:
包括字符串处理,以及数据框求交集、并集、差集等
Talk is cheap, show you the code. 改自Linux之父Linus Torvalds
拆分以及合并某列的数据
library("tidyverse")
table3
#> # A tibble: 6 x 3
#> country year rate
#> * <chr> <int> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
table3 %>%
separate(rate, into = c("cases", "population"))
#> # A tibble: 6 x 4
#> country year cases population
#> <chr> <int> <chr> <chr>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
# separate()默认以non-alphanumeric character(非数字或字母的字符)分割,但你也可以用sep手动指定分隔符。sep还支持正则,比如\D表示非数字;注意在R的字符串里反斜杠需要转义,要写成"\\D"
# Split `rate` on any non-digit character. The regex \D has to be written as
# "\\D" inside an R string literal; "\D" alone is an invalid escape sequence.
table3 %>%
  separate(rate, into = c("cases", "population"), sep = "\\D")
#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <int> <chr> <chr>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
(常见通用正则:
\d 表示数字0-9,
\D 表示非数字,
\s 表示空白字符(包括空格、制表符、换行符等),
\S 表示非空白字符,
\w 表示单词字符(字母、数字和下划线),
\W 表示非单词字符,
\< 和 \> 分别匹配单词的开头和结尾(base R正则支持;stringr中请用 \b 表示单词边界))
# 还可以给sep传递整数,代表从哪里开始截断;1代表最左边,-1代表最右边
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
#> # A tibble: 6 x 4
#> country century year rate
#> <chr> <chr> <chr> <chr>
#> 1 Afghanistan 19 99 745/19987071
#> 2 Afghanistan 20 00 2666/20595360
#> 3 Brazil 19 99 37737/172006362
#> 4 Brazil 20 00 80488/174504898
#> 5 China 19 99 212258/1272915272
#> 6 China 20 00 213766/1280428583
# 有个参数extra,指分割剩下的怎么处理,默认是抛出警告即 "warn"参数,或者"drop",即扔掉,或者"merge",即合并
# 1, warn,默认的
# Default behaviour (extra = "warn"): row 2 has a fourth piece, which is
# discarded and reported in a warning.
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"))
#> # A tibble: 3 × 3
#> one two three
#> <chr> <chr> <chr>
#> 1 a b c
#> 2 d e f
#> 3 h i j
#> Warning message:
#> Expected 3 pieces. Additional pieces discarded in 1 rows [2].
# 2. drop
# extra = "drop": the surplus piece in row 2 is discarded without a warning.
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"), extra = "drop")
#> # A tibble: 3 × 3
#> one two three
#> <chr> <chr> <chr>
#> 1 a b c
#> 2 d e f
#> 3 h i j
# 3. merge
# extra = "merge": split at most 3 times, so the surplus piece in row 2 stays
# attached to the last column ("f,g").
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
  separate(x, c("one", "two", "three"), extra = "merge")
#> # A tibble: 3 × 3
#> one two three
#> <chr> <chr> <chr>
#> 1 a b c
#> 2 d e f,g
#> 3 h i j
# 还有个参数是fill,即填充,默认靠右填充NA,可以指定靠左
# Default fill behaviour: row 2 has only two pieces, so the missing value is
# filled with NA on the right and a warning is emitted.
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"))
#> # A tibble: 3 × 3
#> one two three
#> <chr> <chr> <chr>
#> 1 a b c
#> 2 d e NA
#> 3 f g i
#> Warning message:
#> Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
# fill = "right": pad the missing piece with NA on the right, without a warning.
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"), fill = "right")
#> # A tibble: 3 × 3
#> one two three
#> <chr> <chr> <chr>
#> 1 a b c
#> 2 d e NA
#> 3 f g i
# fill = "left": pad the missing piece with NA on the left instead.
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
  separate(x, c("one", "two", "three"), fill = "left")
#> # A tibble: 3 × 3
#> one two three
#> <chr> <chr> <chr>
#> 1 a b c
#> 2 NA d e
#> 3 f g i
unite
反向操作,即合并
table5 %>%
unite(new, century, year)
#> # A tibble: 6 x 3
#> country new rate
#> <chr> <chr> <chr>
#> 1 Afghanistan 19_99 745/19987071
#> 2 Afghanistan 20_00 2666/20595360
#> 3 Brazil 19_99 37737/172006362
#> 4 Brazil 20_00 80488/174504898
#> 5 China 19_99 212258/1272915272
#> 6 China 20_00 213766/1280428583
# 默认以"_"连接
# 可以用sep指定连接符
table5 %>%
unite(new, century, year, sep = "")
#> # A tibble: 6 x 3
#> country new rate
#> <chr> <chr> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
取交集、并集和差集
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)
intersect(df1, df2) # 取交集
#> # A tibble: 1 x 2
#> x y
#> <dbl> <dbl>
#> 1 1 1
# Note that we get 3 rows, not 4: the row shared by both tables appears once
union(df1, df2) # union: every distinct row found in either df1 or df2
#> # A tibble: 3 x 2
#> x y
#> <dbl> <dbl>
#> 1 1 1
#> 2 2 1
#> 3 1 2
setdiff(df1, df2) # 取在df1中有的,df2中没有的
#> # A tibble: 1 x 2
#> x y
#> <dbl> <dbl>
#> 1 2 1
setdiff(df2, df1) # 取在df2中有的,df1中没有的
#> # A tibble: 1 x 2
#> x y
#> <dbl> <dbl>
#> 1 1 2
连接字符串
str_c("x", "y")
#> [1] "xy"
str_c("x", "y", "z")
#> [1] "xyz"
# sep可指定分隔符
str_c("x", "y", sep = ", ")
#> [1] "x, y"
# 如果想要显示NA,需要用str_replace_na()函数
x <- c("abc", NA)
str_c("|-", x, "-|")
#> [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
#> [1] "|-abc-|" "|-NA-|"
# 自动循环连接
str_c("prefix-", c("a", "b", "c"), "-suffix")
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
# 长度为零的自动去除
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
str_c(
"Good ", time_of_day, " ", name,
if (birthday) " and HAPPY BIRTHDAY",
"."
)
#> [1] "Good morning Hadley."
# 要是换成TRUE:
# With birthday set to TRUE the `if` yields the greeting string instead of
# NULL, so the extra text is included in the result.
birthday <- TRUE
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
#> [1] "Good morning Hadley and HAPPY BIRTHDAY."
# 合并向量需要用collapse参数
str_c(c("x", "y", "z"), collapse = ", ")
#> [1] "x, y, z"
paste和paste0的区别?paste默认以空格连接,paste0则和str_c一样,默认直连
paste("foo", "bar")
#> [1] "foo bar"
paste0("foo", "bar")
#> [1] "foobar"
str_c("foo", "bar")
#> [1] "foobar"
# 但是str_c遇到NA就直接返回NA
str_c("foo",NA)
#> [1] NA
str_trim()用于去除字符串首尾的空白字符,类似python中的str.strip();
通过side参数可以只去除左边或右边(相当于lstrip()/rstrip())。与其相反的是str_pad(),用于填充
str_trim(" abc ")
#> [1] "abc"
str_trim(" abc ", side = "left")
#> [1] "abc "
str_trim(" abc ", side = "right")
#> [1] " abc"
# 参数str_pad(string, width, side = c("left", "right", "both"), pad = " ")
str_pad("abc", width = 5, side = "both", pad = "i")
#> [1] "iabci"
写个函数把c("a", "b", "c")
转化为字符串a, b, and c
# Collapse a vector into a single English-style list such as "a, b, and c".
#
# x:     vector of items to join.
# delim: string appended to every item except the last when there are three
#        or more items (default ",").
#
# Returns "" for a zero-length input, `x` itself for a single item,
# "<first> and <second>" for two items, and otherwise the delimited items
# followed by "and <last>", all joined with single spaces.
str_commasep <- function(x, delim = ",") {
  count <- length(x)

  # Nothing to join.
  if (count == 0) {
    return("")
  }

  # A single item is returned untouched.
  if (count == 1) {
    return(x)
  }

  # Two items: plain "a and b", no delimiter.
  if (count == 2) {
    return(str_c(x[[1]], "and", x[[2]], sep = " "))
  }

  # Three or more: delimiter after every item but the last ...
  leading <- str_c(x[-count], delim)
  # ... and "and" in front of the final item.
  closing <- str_c("and", x[[count]], sep = " ")

  # Glue the pieces together with single spaces.
  str_c(c(leading, closing), collapse = " ")
}
str_commasep("")
#> [1] ""
str_commasep("a")
#> [1] "a"
str_commasep(c("a", "b"))
#> [1] "a and b"
str_commasep(c("a", "b", "c"))
#> [1] "a, b, and c"
str_commasep(c("a", "b", "c", "d"))
#> [1] "a, b, c, and d"
OK,本篇完
ref: https://r4ds.had.co.nz/tidy-data.html