Data Wrangling with dplyr
R Programming & Data Analytics / Data Wrangling with dplyr

Data Wrangling with dplyr

Intermediate 15 hrs 4 Concepts
M1

The Six Core Verbs

Concept 1

filter() and select()

filter() keeps rows matching a condition. select() chooses columns.

R
library(dplyr)

students <- tibble(
  name    = c('Aarav','Kavya','Rohan','Ananya','Vivaan'),
  subject = c('Math','Math','Science','Math','Science'),
  score   = c(92, 88, 75, 95, 82),
  year    = c(11, 11, 10, 11, 10)
)

# filter() — multiple conditions use & (AND) or | (OR)
students |> filter(score >= 90)
students |> filter(subject == 'Math', year == 11)   # AND
students |> filter(score > 80 | year == 10)          # OR
students |> filter(name %in% c('Aarav','Kavya'))     # %in%
students |> filter(!is.na(score))                    # not NA

# select() — choose, drop, or rename columns
students |> select(name, score)
students |> select(-year)                  # drop year
students |> select(starts_with('s'))       # helper: starts_with
students |> select(where(is.numeric))      # all numeric cols
students |> select(student=name, marks=score)  # rename while selecting
R
# filter() — keep rows matching a condition
library(dplyr)
high_mpg <- filter(mpg, hwy > 30, cyl == 4)
select(high_mpg, manufacturer, model, hwy, cyl)
Data Frame Output
manufacturermodelhwycyl
hondacivic364
hondacivic364
toyotacorolla354
volkswagenjetta344
toyotacorolla344
volkswagennew beetle444
Solved Examples
Example 1 From students data, select only Math students in year 11 and show only their name and score.
R
students |>
  filter(subject == 'Math', year == 11) |>
  select(name, score)
# Aarav  92
# Kavya  88
# Ananya 95
Example 2 Keep only rows where subject is either 'Math' or 'Physics'.
R
df |> filter(subject %in% c('Math', 'Physics'))
# Equivalent to:
df |> filter(subject == 'Math' | subject == 'Physics')
# %in% is cleaner for 3+ values
Self-Assessment (3 questions)
Q1. Which dplyr verb removes columns?
Q2. What does filter(df, !is.na(score)) do?
Q3. Which select() helper picks columns that match a pattern?
Concept 2

mutate(), arrange(), summarise(), group_by()

mutate() adds/transforms columns. arrange() sorts. summarise() collapses to aggregated rows. group_by() makes operations per-group.

R
# mutate() — add or transform columns
students |> mutate(
  grade     = if_else(score >= 90, 'A', if_else(score >= 80, 'B', 'C')),
  score_pct = score / 100,
  rank      = rank(desc(score))   # rank within mutate
)

# arrange() — sort ascending; desc() for descending
students |> arrange(desc(score), name)   # score descending, name alphabetical

# summarise() — aggregate to one row
students |> summarise(
  n          = n(),
  mean_score = mean(score),
  max_score  = max(score),
  pass_rate  = mean(score >= 80)
)

# group_by() + summarise() — aggregate per group
students |>
  group_by(subject, year) |>
  summarise(
    mean_score = mean(score),
    n          = n(),
    .groups    = 'drop'   # always add this to avoid warnings
  )

# group_by() + mutate() — per-group transform (keeps all rows!)
students |>
  group_by(subject) |>
  mutate(subject_rank = rank(desc(score)))
R
# group_by + summarise — mean MPG per car class
mpg %>%
  group_by(class) %>%
  summarise(
    avg_hwy  = round(mean(hwy), 1),
    avg_cty  = round(mean(cty), 1),
    count    = n()
  ) %>%
  arrange(desc(avg_hwy))
Data Frame Output
classavg_hwyavg_ctycount
subcompact28.120.435
compact28.320.147
midsize27.318.841
minivan22.415.811
2seater24.815.45
pickup16.913.033
suv18.113.562
R
# mutate() — add derived columns
mpg %>%
  mutate(
    efficiency_ratio = round(hwy / cty, 2),
    size_class       = if_else(displ > 3.5, "Large", "Small")
  ) %>%
  select(model, displ, cty, hwy, efficiency_ratio, size_class) %>%
  head(6)
Chart Output
Solved Examples
Example 1 For each subject, calculate the number of students, mean score, and percentage scoring >= 80.
R
students |>
  group_by(subject) |>
  summarise(
    n         = n(),
    mean_score = round(mean(score), 1),
    pass_rate  = paste0(round(mean(score >= 80)*100), '%'),
    .groups   = 'drop'
  )
Example 2 Add a column showing each student's rank within their subject, ordered by score descending.
R
students |>
  group_by(subject) |>
  mutate(rank = rank(desc(score))) |>
  arrange(subject, rank)
Self-Assessment (2 questions)
Q1. What is the difference between summarise() and mutate() after group_by()?
Q2. Which argument should you add to summarise() to suppress grouping warnings?
M2

Joins and Advanced Operations

Concept 3

Joining Datasets

dplyr provides SQL-style joins to combine two tables by matching key columns.

R
students <- tibble(id=1:4, name=c('Aarav','Kavya','Rohan','Ananya'))
enrolled <- tibble(id=c(1,2,4,5), course=c('R','Python','SQL','Julia'))

# inner_join — only rows with match in BOTH tables
inner_join(students, enrolled, by='id')
# Aarav-R, Kavya-Python, Ananya-SQL (Rohan dropped, 5 dropped)

# left_join — ALL rows from left; NA if no match on right
left_join(students, enrolled, by='id')
# Aarav-R, Kavya-Python, Rohan-NA, Ananya-SQL

# right_join — ALL rows from right
right_join(students, enrolled, by='id')

# full_join — ALL rows from both
full_join(students, enrolled, by='id')

# anti_join — rows in left NOT in right (non-enrolled students)
anti_join(students, enrolled, by='id')
# Rohan only

# semi_join — rows in left that HAVE a match in right (no extra cols)
semi_join(students, enrolled, by='id')
# Aarav, Kavya, Ananya (no extra columns from enrolled)

# Different column names
left_join(students, enrolled, by=c('id'='student_id'))
R
# inner_join — combine orders with customer info
orders     <- data.frame(id=c(1,2,3,4), cust_id=c(10,11,10,12), amount=c(250,180,320,95))
customers  <- data.frame(cust_id=c(10,11,13), name=c("Aarav","Kavya","Rohan"), city=c("Mumbai","Delhi","Pune"))
inner_join(orders, customers, by="cust_id")
Data Frame Output
idcust_idamountnamecity
110250AaravMumbai
310320AaravMumbai
211180KavyaDelhi
Solved Examples
Example 1 You have a products table and sales table. Find products that have never been sold.
R
anti_join(products, sales, by='product_id')
# Returns products with no matching product_id in sales
# Perfect for finding 'dead stock'
Self-Assessment (2 questions)
Q1. Which join keeps all rows from the LEFT table?
Q2. What does anti_join() return?
Concept 4

across() — Apply to Multiple Columns

across() inside mutate() or summarise() applies a function to multiple columns at once, eliminating repetition.

R
library(dplyr)

# Standardise ALL numeric columns
df |> mutate(across(where(is.numeric), scale))

# Round specific columns to 2 decimal places
df |> mutate(across(c(math, english, science), round, digits=2))

# Multiple functions at once
df |> summarise(across(
  c(math, english, science),
  list(mean=mean, sd=sd, max=max),
  na.rm=TRUE
))
# Creates: math_mean, math_sd, math_max, english_mean, etc.

# Custom anonymous function
df |> mutate(across(where(is.character), ~ toupper(trimws(.))))

# rename_with() — transform column names consistently
df |> rename_with(tolower)              # all to lowercase
df |> rename_with(~ gsub(' ','_',.))
Solved Examples
Example 1 Apply percent_rank() to all numeric columns simultaneously.
R
library(dplyr)
df |> mutate(across(where(is.numeric), percent_rank))
# percent_rank() gives 0-1 rank for each value
# Applied to ALL numeric columns in one line
Self-Assessment (1 questions)
Q1. What does across(where(is.numeric), mean) do inside summarise()?
Data Import & Export Data Reshaping with tidyr