R Data Structures In Depth
Matrices and Arrays
Creating and Indexing Matrices
A matrix is a 2-dimensional vector — all elements same type, filled column-by-column by default.
# Build a 3 x 4 integer matrix; values fill down each column first.
m <- matrix(1:12, nrow = 3, ncol = 4)
#      [,1] [,2] [,3] [,4]
# [1,]    1    4    7   10
# [2,]    2    5    8   11
# [3,]    3    6    9   12

# Index as [row, col]; either position accepts a vector of indices.
m[2, 3]       # single element: 8
m[1, ]        # whole first row: 1 4 7 10
m[, 2]        # whole second column: 4 5 6
m[1:2, 3:4]   # 2 x 2 submatrix (rows 1-2, columns 3-4)

# Common matrix operations
t(m)          # transpose (4 x 3)
A <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2)
solve(A)      # inverse of A
det(A)        # determinant: -2
A %*% t(A)    # matrix product of A with its transpose

# apply() with MARGIN = 1 works over rows, MARGIN = 2 over columns.
apply(m, MARGIN = 1, FUN = sum)    # row sums: 22 26 30
apply(m, MARGIN = 2, FUN = mean)   # column means: 2 5 8 11
# Worked example: carve the top-right 2 x 2 corner out of a 4 x 4 matrix.
m <- matrix(1:16, nrow = 4, ncol = 4)
m[1:2, 3:4]   # rows 1-2 of columns 3-4
#      [,1] [,2]
# [1,]    9   13
# [2,]   10   14
Lists and Data Frames
Lists — Flexible Containers
A list can hold any mix of types, including other lists. They're the backbone of model objects and API responses.
# A named list mixing character, numeric, vector, and logical elements.
student <- list(
  name   = "Aarav Mehta",
  grade  = 11,
  scores = c(92, 88, 95),
  passed = TRUE
)

# Three ways to pull a single element out of a list
student$name          # "Aarav Mehta"   -- `$` with a literal name
student[["grade"]]    # 11              -- `[[ ]]` with a name
student[[3]]          # c(92, 88, 95)   -- `[[ ]]` with a position

# `[ ]` keeps the list wrapper; `[[ ]]` unwraps the element itself.
student[1]            # a length-1 list containing `name`
student[[1]]          # the character string itself

# Change an existing element or append a new one
student$school <- "Vidaara Academy"
student[["grade"]] <- 12

# Map a function over every element of a list
score_lists <- list(a = 1:5, b = 6:10, c = 11:15)
lapply(score_lists, mean)   # list result: 3, 8, 13
sapply(score_lists, mean)   # simplified to a named numeric vector
Exercise: Given info <- list(student=list(name='Kavya', scores=c(88,92)), year=2024), access Kavya's first score.
# Nested list: `student` is itself a list stored inside `info`.
info <- list(
  student = list(name = "Kavya", scores = c(88, 92)),
  year    = 2024
)
info$student$scores[1]               # 88 -- chained `$`
# equivalent, using `[[ ]]` by name at each level
info[["student"]][["scores"]][1]     # 88
What is the difference between list[1] and list[[1]] in R?
Data Frames — Tabular Data
A data frame is R's primary structure for tabular data — a list of equal-length vectors treated as columns.
# Build a small data frame: one row per student, one column per variable.
df <- data.frame(
  student = c("Aarav", "Kavya", "Rohan", "Ananya"),
  score   = c(92, 88, 95, 78),
  grade   = c("A", "B", "A", "C"),
  stringsAsFactors = FALSE   # the default since R 4.0; explicit for older R
)

# Inspect
nrow(df); ncol(df); dim(df)   # 4 3 | 4 3
str(df)                       # structure and column types
head(df, 2)                   # first 2 rows
# glimpse() is not base R. It is namespace-qualified here so the call works
# even though library(tibble) is only attached further down.
tibble::glimpse(df)           # tidyverse counterpart of str()

# Access columns
df$score           # numeric vector
df[["student"]]    # same kind of extraction, by name

# Filter rows
df[df$score >= 90, ]                     # students scoring 90+
subset(df, score >= 90 & grade == "A")   # filter on two conditions at once

# Add column
df$passed <- df$score >= 80

# Tibbles (tidyverse data frames) -- nicer printing
library(tibble)
tbl <- as_tibble(df)
# Worked example: find the student with the highest combined score.
df <- data.frame(
  name    = c("Aarav", "Kavya", "Rohan"),
  math    = c(92, 88, 75),
  english = c(85, 90, 82),
  science = c(88, 85, 91)
)
# Per-student total across the three subjects
df$total <- with(df, math + english + science)
# Pick the row holding the largest total and read off its name
df[which.max(df$total), "name"]   # "Aarav" with 265
Factors and Dates
Factors for Categorical Data
Factors store categorical data as integer codes with a levels attribute. Essential for statistical models and ordered plots.
# Unordered factor
# Unordered factor
grade <- factor(c("A", "B", "A", "C", "B", "A"))
levels(grade)    # "A" "B" "C" (alphabetical default)
table(grade)     # frequency table
nlevels(grade)   # 3

# Ordered factor (for comparisons)
size <- factor(
  c("Small", "Large", "Medium", "Large"),
  levels  = c("Small", "Medium", "Large"),
  ordered = TRUE
)
size[1] < size[2]   # TRUE: Small < Large

# forcats package for manipulation
library(forcats)
fct_count(grade)                    # frequency table as tibble
fct_relevel(grade, "C", "B", "A")   # reorder levels
fct_lump(grade, n = 2)              # keep top 2, lump rest as 'Other'

# fct_reorder() needs a second vector with one value per element of `grade`.
# Example scores are defined here so the call is runnable on its own.
grade_scores <- c(91, 78, 95, 62, 84, 88)
# Levels are reordered by the mean score within each grade: C, B, A
fct_reorder(grade, grade_scores, .fun = mean)
Exercise: A factor experience with levels 'Junior','Senior','Lead' is appearing in the wrong order in a ggplot2 bar chart. Fix it.
# Without explicit levels, factor() orders them alphabetically.
experience <- factor(c("Senior", "Junior", "Lead", "Junior", "Senior"))
# Put the levels into career-progression order
experience <- fct_relevel(experience, "Junior", "Senior", "Lead")
levels(experience)   # "Junior" "Senior" "Lead"
# ggplot2 draws the bars in this level order from here on