library(readr)
df <- read_csv("data.csv")
Data in Tables (reference card)
Reading data
CSV data
TSV data
library(readr)
df <- read_tsv("data.tsv")
Properly reading in null values
By default read_csv and read_tsv read in empty cells and cells with "NA" as NA values. You can change which values are read in as NA using the na argument:
library(readr)
# Only read "-999" as NA
df <- read_tsv("data.tsv", na = c("-999"))

# Read in empty values, "NA", & "-999" as NA
df <- read_tsv("data.tsv", na = c("", "NA", "-999"))
Basic dplyr
library(dplyr)
Select columns (select)
df_with_selected_columns <- select(df, col_name_1, col_name_2)
Add a new column (mutate)
df_with_new_column <- mutate(df, col_name_3 = col_name_1 * col_name_2)
Sort rows (arrange)
Sort ascending by col_name_1 and then descending by col_name_2:
sorted_df <- arrange(df, col_name_1, desc(col_name_2))
Filter out rows not matching conditions (filter)
One condition
filtered_df <- filter(df, col_name_2 == "A")
filtered_df <- filter(df, col_name_1 > 5)
More than one condition (and)
Keep rows where col_name_2 is "A" and col_name_1 is greater than 5:
filtered_df <- filter(df, col_name_2 == "A", col_name_1 > 5)
More than one condition (or)
Keep only rows where col_name_2 is either "A" or "B":
filtered_df <- filter(df, col_name_2 == "A" | col_name_2 == "B")
Remove null values (NA)
To create a table without rows that have NAs in any column (drop_na() comes from the tidyr package):
library(tidyr)
df_no_na <- drop_na(df)
To drop only rows that have NAs in specific columns, also list the names of those columns:
df_no_na <- drop_na(df, col_name_to_drop_if_na)
Combining data manipulations
Intermediate variables
Store each step in a new variable and then use that variable name in the next step.
selected_df <- select(df, col_name_1, col_name_2)
filtered_selected_df <- filter(selected_df, col_name_1 == "A")
no_na_filtered_selected_df <- drop_na(filtered_selected_df)
Pipes
Pipes (|>) pass the output from the command on the left of the pipe as the first argument to the function on the right of the pipe. So instead of storing results in intermediate variables we can pass them through pipes.
no_na_filtered_selected_df <- df |>
  select(col_name_1, col_name_2) |>
  filter(col_name_1 == "A") |>
  drop_na()