Data Wrangling with dplyr
The Six Core Verbs
filter() and select()
filter() keeps rows matching a condition. select() chooses columns.
library(dplyr)
students <- tibble(
name = c('Aarav','Kavya','Rohan','Ananya','Vivaan'),
subject = c('Math','Math','Science','Math','Science'),
score = c(92, 88, 75, 95, 82),
year = c(11, 11, 10, 11, 10)
)
# filter() — multiple conditions use & (AND) or | (OR)
students |> filter(score >= 90)
students |> filter(subject == 'Math', year == 11) # AND
students |> filter(score > 80 | year == 10) # OR
students |> filter(name %in% c('Aarav','Kavya')) # %in%
students |> filter(!is.na(score)) # not NA
# select() — choose, drop, or rename columns
students |> select(name, score)
students |> select(-year) # drop year
students |> select(starts_with('s')) # helper: starts_with
students |> select(where(is.numeric)) # all numeric cols
students |> select(student=name, marks=score) # rename while selecting
# filter() — keep rows matching a condition
library(dplyr)
high_mpg <- filter(mpg, hwy > 30, cyl == 4)
select(high_mpg, manufacturer, model, hwy, cyl)
| manufacturer | model | hwy | cyl |
|---|---|---|---|
| honda | civic | 36 | 4 |
| honda | civic | 36 | 4 |
| toyota | corolla | 35 | 4 |
| volkswagen | jetta | 34 | 4 |
| toyota | corolla | 34 | 4 |
| volkswagen | new beetle | 44 | 4 |
students |>
filter(subject == 'Math', year == 11) |>
select(name, score)
# Aarav 92
# Kavya 88
# Ananya 95
df |> filter(subject %in% c('Math', 'Physics'))
# Equivalent to:
df |> filter(subject == 'Math' | subject == 'Physics')
# %in% is cleaner for 3+ values
filter(df, !is.na(score)) do?mutate(), arrange(), summarise(), group_by()
mutate() adds/transforms columns. arrange() sorts. summarise() collapses to aggregated rows. group_by() makes operations per-group.
# mutate() — add or transform columns
students |> mutate(
grade = if_else(score >= 90, 'A', if_else(score >= 80, 'B', 'C')),
score_pct = score / 100,
rank = rank(desc(score)) # rank within mutate
)
# arrange() — sort ascending; desc() for descending
students |> arrange(desc(score), name) # score descending, name alphabetical
# summarise() — aggregate to one row
students |> summarise(
n = n(),
mean_score = mean(score),
max_score = max(score),
pass_rate = mean(score >= 80)
)
# group_by() + summarise() — aggregate per group
students |>
group_by(subject, year) |>
summarise(
mean_score = mean(score),
n = n(),
.groups = 'drop' # always add this to avoid warnings
)
# group_by() + mutate() — per-group transform (keeps all rows!)
students |>
group_by(subject) |>
mutate(subject_rank = rank(desc(score)))
# group_by + summarise — mean MPG per car class
mpg %>%
group_by(class) %>%
summarise(
avg_hwy = round(mean(hwy), 1),
avg_cty = round(mean(cty), 1),
count = n()
) %>%
arrange(desc(avg_hwy))
| class | avg_hwy | avg_cty | count |
|---|---|---|---|
| subcompact | 28.1 | 20.4 | 35 |
| compact | 28.3 | 20.1 | 47 |
| midsize | 27.3 | 18.8 | 41 |
| minivan | 22.4 | 15.8 | 11 |
| 2seater | 24.8 | 15.4 | 5 |
| pickup | 16.9 | 13.0 | 33 |
| suv | 18.1 | 13.5 | 62 |
# mutate() — add derived columns
mpg %>%
mutate(
efficiency_ratio = round(hwy / cty, 2),
size_class = if_else(displ > 3.5, "Large", "Small")
) %>%
select(model, displ, cty, hwy, efficiency_ratio, size_class) %>%
head(6)
students |>
group_by(subject) |>
summarise(
n = n(),
mean_score = round(mean(score), 1),
pass_rate = paste0(round(mean(score >= 80)*100), '%'),
.groups = 'drop'
)
students |>
group_by(subject) |>
mutate(rank = rank(desc(score))) |>
arrange(subject, rank)
Joins and Advanced Operations
Joining Datasets
dplyr provides SQL-style joins to combine two tables by matching key columns.
students <- tibble(id=1:4, name=c('Aarav','Kavya','Rohan','Ananya'))
enrolled <- tibble(id=c(1,2,4,5), course=c('R','Python','SQL','Julia'))
# inner_join — only rows with match in BOTH tables
inner_join(students, enrolled, by='id')
# Aarav-R, Kavya-Python, Ananya-SQL (Rohan dropped, 5 dropped)
# left_join — ALL rows from left; NA if no match on right
left_join(students, enrolled, by='id')
# Aarav-R, Kavya-Python, Rohan-NA, Ananya-SQL
# right_join — ALL rows from right
right_join(students, enrolled, by='id')
# full_join — ALL rows from both
full_join(students, enrolled, by='id')
# anti_join — rows in left NOT in right (non-enrolled students)
anti_join(students, enrolled, by='id')
# Rohan only
# semi_join — rows in left that HAVE a match in right (no extra cols)
semi_join(students, enrolled, by='id')
# Aarav, Kavya, Ananya (no extra columns from enrolled)
# Different column names
left_join(students, enrolled, by=c('id'='student_id'))
# inner_join — combine orders with customer info
orders <- data.frame(id=c(1,2,3,4), cust_id=c(10,11,10,12), amount=c(250,180,320,95))
customers <- data.frame(cust_id=c(10,11,13), name=c("Aarav","Kavya","Rohan"), city=c("Mumbai","Delhi","Pune"))
inner_join(orders, customers, by="cust_id")
| id | cust_id | amount | name | city |
|---|---|---|---|---|
| 1 | 10 | 250 | Aarav | Mumbai |
| 3 | 10 | 320 | Aarav | Mumbai |
| 2 | 11 | 180 | Kavya | Delhi |
products table and sales table. Find products that have never been sold.
anti_join(products, sales, by='product_id')
# Returns products with no matching product_id in sales
# Perfect for finding 'dead stock'
across() — Apply to Multiple Columns
across() inside mutate() or summarise() applies a function to multiple columns at once, eliminating repetition.
library(dplyr)
# Standardise ALL numeric columns
df |> mutate(across(where(is.numeric), scale))
# Round specific columns to 2 decimal places
df |> mutate(across(c(math, english, science), round, digits=2))
# Multiple functions at once
df |> summarise(across(
c(math, english, science),
list(mean=mean, sd=sd, max=max),
na.rm=TRUE
))
# Creates: math_mean, math_sd, math_max, english_mean, etc.
# Custom anonymous function
df |> mutate(across(where(is.character), ~ toupper(trimws(.))))
# rename_with() — transform column names consistently
df |> rename_with(tolower) # all to lowercase
df |> rename_with(~ gsub(' ','_',.))
library(dplyr)
df |> mutate(across(where(is.numeric), percent_rank))
# percent_rank() gives 0-1 rank for each value
# Applied to ALL numeric columns in one line
across(where(is.numeric), mean) do inside summarise()?