4  dplyr: Row-wise operations

Published

March 14, 2023

Modified

January 8, 2024

This vignette centers on the use of rowwise(), which was introduced in 2020 with dplyr 1.0.0. See dplyr 1.0.0 notes.

4.1 Creating

rowwise() works like group_by() in the sense that it doesn’t change what the data looks like; it changes how dplyr verbs operate on the data. rowwise() switches the orientation of dplyr verbs on data frames to work across rows instead of down columns.

df <- tibble(x = 1:2, y = 3:4, z = 5:6)

# Normal
df %>% mutate(m = mean(c(x, y, z)))
#> # A tibble: 2 × 4
#>       x     y     z     m
#>   <int> <int> <int> <dbl>
#> 1     1     3     5   3.5
#> 2     2     4     6   3.5

# rowwise
df %>% rowwise() %>% mutate(m = mean(c(x, y, z)))
#> # A tibble: 2 × 4
#> # Rowwise: 
#>       x     y     z     m
#>   <int> <int> <int> <dbl>
#> 1     1     3     5     3
#> 2     2     4     6     4

You can supply “identifier” variables in rowwise(). These variables are preserved when you call summarise() and so function similarly to variables in group_by(). You can remove the grouping with ungroup() or .groups.

df <- tibble(name = c("Mara", "Hadley"), x = 1:2, y = 3:4, z = 5:6)

df %>% 
  rowwise() %>% 
  summarise(m = mean(c(x, y, z)))
#> # A tibble: 2 × 1
#>       m
#>   <dbl>
#> 1     3
#> 2     4

df %>% 
  rowwise(name) %>% 
  summarise(m = mean(c(x, y, z)), .groups = "drop")
#> # A tibble: 2 × 2
#>   name       m
#>   <chr>  <dbl>
#> 1 Mara       3
#> 2 Hadley     4

4.2 Per row summary statistics

Use mutate() to add a new column to each row; use summarise() to return just the summary column(s) and any “identifier” columns.

Data that might represent scores for students on tests:

df <- tibble(id = 1:6, w = 10:15, x = 20:25, y = 30:35, z = 40:45)

Sum of total scores as a new column with mutate() or on its own with summarise():

# mutate
df %>% 
  rowwise() %>% 
  mutate(total = sum(c(w, x, y, z)))
#> # A tibble: 6 × 6
#> # Rowwise: 
#>      id     w     x     y     z total
#>   <int> <int> <int> <int> <int> <int>
#> 1     1    10    20    30    40   100
#> 2     2    11    21    31    41   104
#> 3     3    12    22    32    42   108
#> 4     4    13    23    33    43   112
#> 5     5    14    24    34    44   116
#> 6     6    15    25    35    45   120

# summarise
df %>% 
  rowwise() %>% 
  summarise(total = sum(c(w, x, y, z)), .groups = "drop")
#> # A tibble: 6 × 1
#>   total
#>   <int>
#> 1   100
#> 2   104
#> 3   108
#> 4   112
#> 5   116
#> 6   120

You can use c_across() to access tidy selection syntax and “identifier” columns are not used within computations.

df %>% 
  rowwise(id) %>% 
  mutate(total = sum(c_across(w:z)))
#> # A tibble: 6 × 6
#> # Rowwise:  id
#>      id     w     x     y     z total
#>   <int> <int> <int> <int> <int> <int>
#> 1     1    10    20    30    40   100
#> 2     2    11    21    31    41   104
#> 3     3    12    22    32    42   108
#> 4     4    13    23    33    43   112
#> 5     5    14    24    34    44   116
#> 6     6    15    25    35    45   120

df %>% 
  rowwise(id) %>% 
  mutate(total = sum(c_across(where(is.numeric))))
#> # A tibble: 6 × 6
#> # Rowwise:  id
#>      id     w     x     y     z total
#>   <int> <int> <int> <int> <int> <int>
#> 1     1    10    20    30    40   100
#> 2     2    11    21    31    41   104
#> 3     3    12    22    32    42   108
#> 4     4    13    23    33    43   112
#> 5     5    14    24    34    44   116
#> 6     6    15    25    35    45   120

You can combine rowwise analysis with column-wise analysis such as computing the proportion of the total for each column.

df %>% 
  rowwise(id) %>% 
  mutate(total = sum(c_across(w:z))) %>% 
  ungroup() %>% 
  mutate(across(w:z, \(x) x / total))
#> # A tibble: 6 × 6
#>      id     w     x     y     z total
#>   <int> <dbl> <dbl> <dbl> <dbl> <int>
#> 1     1 0.1   0.2   0.3   0.4     100
#> 2     2 0.106 0.202 0.298 0.394   104
#> 3     3 0.111 0.204 0.296 0.389   108
#> 4     4 0.116 0.205 0.295 0.384   112
#> 5     5 0.121 0.207 0.293 0.379   116
#> 6     6 0.125 0.208 0.292 0.375   120

4.3 List-columns

rowwise() pairs well with list columns and helps to avoid for loops, use of the apply() functions, or purr::map() functions. If you want to get the length of list columns you either have to use the rather esoteric lengths() function or apply() or map() functions. But you can now do this with rowwise().

df <- tibble(x = list(1, 2:3, 4:6))

# Base apply
df %>% mutate(l = sapply(x, length))
#> # A tibble: 3 × 2
#>   x             l
#>   <list>    <int>
#> 1 <dbl [1]>     1
#> 2 <int [2]>     2
#> 3 <int [3]>     3

# purrr::map
df %>% mutate(l = purrr::map_int(x, length))
#> # A tibble: 3 × 2
#>   x             l
#>   <list>    <int>
#> 1 <dbl [1]>     1
#> 2 <int [2]>     2
#> 3 <int [3]>     3

# rowwise
df %>% 
  rowwise() %>% 
  mutate(l = length(x))
#> # A tibble: 3 × 2
#> # Rowwise: 
#>   x             l
#>   <list>    <int>
#> 1 <dbl [1]>     1
#> 2 <int [2]>     2
#> 3 <int [3]>     3

4.3.1 Subsetting list-columns

List-columns provide a good way to understand how grouping with rowwise() works differently from grouping with group_by() even when each group happens to have only one row. In rowwise() each group always only has one row.

df <- tibble(g = 1:2, y = list(1:3, "a"))
gf <- df %>% group_by(g)
rf <- df %>% rowwise(g)

# group_by
gf %>% mutate(type = typeof(y), length = length(y))
#> # A tibble: 2 × 4
#> # Groups:   g [2]
#>       g y         type  length
#>   <int> <list>    <chr>  <int>
#> 1     1 <int [3]> list       1
#> 2     2 <chr [1]> list       1

# rowwise
rf %>% mutate(type = typeof(y), length = length(y))
#> # A tibble: 2 × 4
#> # Rowwise:  g
#>       g y         type      length
#>   <int> <list>    <chr>      <int>
#> 1     1 <int [3]> integer        3
#> 2     2 <chr [1]> character      1

mutate() slices that data to pass to length(y) with [ in the grouped data frame but uses [[ subsetting for row-wise mutates.

4.3.2 Modelling

rowwise() and list-columns provides a variety of solutions to modelling problems. You can create a nested data frame that is more explicit than group_by() and returns a rowwise data frame.

by_cyl <- mtcars %>% nest_by(cyl)
by_cyl
#> # A tibble: 3 × 2
#> # Rowwise:  cyl
#>     cyl                data
#>   <dbl> <list<tibble[,10]>>
#> 1     4           [11 × 10]
#> 2     6            [7 × 10]
#> 3     8           [14 × 10]

Now we can make one model per row and one set of predictions per row:

mods <- by_cyl %>% 
  mutate(mod = list(lm(mpg ~ wt, data = data))) %>% 
  mutate(pred = list(predict(mod, data)))
mods
#> # A tibble: 3 × 4
#> # Rowwise:  cyl
#>     cyl                data mod    pred      
#>   <dbl> <list<tibble[,10]>> <list> <list>    
#> 1     4           [11 × 10] <lm>   <dbl [11]>
#> 2     6            [7 × 10] <lm>   <dbl [7]> 
#> 3     8           [14 × 10] <lm>   <dbl [14]>

You can then summarize the model in a variety of ways or access the parameters of each model:

# Summary
mods %>% summarise(rsq = summary(mod)$r.squared)
#> `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
#> argument.
#> # A tibble: 3 × 2
#> # Groups:   cyl [3]
#>     cyl   rsq
#>   <dbl> <dbl>
#> 1     4 0.509
#> 2     6 0.465
#> 3     8 0.423

# Access parameters
mods %>% reframe(broom::tidy(mod))
#> # A tibble: 6 × 6
#>     cyl term        estimate std.error statistic    p.value
#>   <dbl> <chr>          <dbl>     <dbl>     <dbl>      <dbl>
#> 1     4 (Intercept)    39.6      4.35       9.10 0.00000777
#> 2     4 wt             -5.65     1.85      -3.05 0.0137    
#> 3     6 (Intercept)    28.4      4.18       6.79 0.00105   
#> 4     6 wt             -2.78     1.33      -2.08 0.0918    
#> 5     8 (Intercept)    23.9      3.01       7.94 0.00000405
#> 6     8 wt             -2.19     0.739     -2.97 0.0118

4.4 Repeated function calls

rowwise() can also return a vector of length greater than one if the result is a list. This means that rowwise() and mutate() provide an elegant way to call a function many times with varying arguments, storing the outputs alongside the inputs. For instance you can store argument parameters for runif() in a data frame and then have the results presented in a list column.

df <- tribble(
  ~ n, ~ min, ~ max,
    1,     0,     1,
    2,    10,   100,
    3,   100,  1000,
)

df %>% 
  rowwise() %>% 
  mutate(data = list(runif(n, min, max)))
#> # A tibble: 3 × 4
#> # Rowwise: 
#>       n   min   max data     
#>   <dbl> <dbl> <dbl> <list>   
#> 1     1     0     1 <dbl [1]>
#> 2     2    10   100 <dbl [2]>
#> 3     3   100  1000 <dbl [3]>

You can also vary the functions called with the use of do.call().

df <- tribble(
   ~rng,     ~params,
   "runif",  list(n = 10), 
   "rnorm",  list(n = 20),
   "rpois",  list(n = 10, lambda = 5),
) %>%
  rowwise()

df %>% 
  mutate(data = list(do.call(rng, params)))
#> # A tibble: 3 × 3
#> # Rowwise: 
#>   rng   params           data      
#>   <chr> <list>           <list>    
#> 1 runif <named list [1]> <dbl [10]>
#> 2 rnorm <named list [1]> <dbl [20]>
#> 3 rpois <named list [2]> <int [10]>