1. Setting up your RStudio workspace

Preparing the environment and loading data files

🏠️Setting the working directory
📦️Installing and loading packages
🗒️Importing data: Assigning names and reading files
🏷️Labeling variables
🗂️Factorizing categorical variables
⭐️Putting it all together

⬇️

Click here to download the example CSV file, be sure to save as "dogs.csv"

🏠️

1. Setting the working directory

Function "setwd"
Parameter: The file path as a character vector
Purpose: Maps your directory to the specified path

# Syntax usage, any text in bold is user input
# The path is a quoted string written with forward slashes
setwd("path/here/with/forward/slashes")

# Set working directory (replace this example path with your own folder)
setwd("C:/Users/lianjarzbecker/Downloads")


# Verify working directory: getwd() returns the current path
getwd()

📦️

2. Installing and loading packages

Function "install_and_load"
Parameter: A single package name as a character vector (the code below applies the function to every name in your package list)
Purpose:
- Checks if each package is installed
- Installs any missing packages
- Loads all specified packages

# Syntax usage
packages <- c("package1", "package2")

# Package list: the names of the packages to install and load, as quoted strings
packages <- c("readxl", "expss")

### No need to edit anything below this triple comment symbol
# Function for installing missing packages
# Takes one package name (`pkg`, a character string), installs the package
# from CRAN if it is not installed yet, then attaches it with library().
install_and_load <- function(pkg) {
  # requireNamespace() only tests availability; with quietly = TRUE it
  # returns FALSE for a missing package instead of printing a warning
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cran.us.r-project.org")
  }
  # library() stops with an error if the package still cannot be loaded
  library(pkg, character.only = TRUE)
}

# Install and load packages
# lapply() calls install_and_load() once for each name in `packages`;
# invisible() keeps the list returned by lapply() from printing in the console
invisible(lapply(packages, install_and_load))
### No need to edit anything above this triple comment symbol

🗒️

3. Importing data: Assigning names and reading files

Variable names can be assigned directly without calling a function by using the assignment operator "<-"
- Parameter: Variable name
- Purpose: Stores variable and allows for referencing
When reading a file, the function used depends on the file type
- Parameter: File name as a character vector, including the file extension
- Purpose: Reads the file (assuming it's in your working directory) and stores it as a data frame under your assigned variable name

# Syntax usage
# Each call reads one file and stores the result under data_frame_name
data_frame_name <- read.csv("filename.csv")
data_frame_name <- read_xlsx("filename.xlsx")
data_frame_name <- read.table("filename.txt")

Function "read.csv"
Parameter: File name
Purpose: Read CSV files

Function "read_xlsx" (requires "readxl" package)
Parameter: File name
Purpose: Read Excel (.xlsx) files

Function "read.table"
Parameter: File name
Purpose: Read text files

⬇️

Click here to download the xlsx version and click here to download the txt version.

# The assignment operator "<-" stores the data frame under the name "dogs"
dogs <- read.csv("dogs.csv")


# Handling missing data: empty cells and the text "NA" are both read as NA
dogs <- read.csv("dogs.csv", na.strings = c("", "NA"))


# Verify structure & first few rows of data
str(dogs)
head(dogs)


# Same data, just in different file formats
# read_xlsx() reads .xlsx files and read_xls() reads .xls files;
# read_excel() chooses between the two for you
library(readxl)
dogs_excel <- read_xlsx("dogs.xlsx")


# Function "read.table" assumes whitespace delimiter & no header
# need to indicate the separator is a tab and there is a header
# (write TRUE in full: the shorthand T is a regular variable that can be overwritten)
dogs3_text <- read.table("dogs.txt", sep = "\t", header = TRUE)

🏷️

4. Labeling variables

Function "apply_labels" (requires "expss" package)
Parameter: The data frame, followed by variable = "Label" pairs (each label is a character vector)
Purpose: Enhances readability and interpretability of your data

# Syntax usage
# First argument is the data frame; each following argument pairs a
# column name with the label to attach to it
data <- apply_labels(data,
                     variable1 = "Variable 1",
                     variable2 = "Variable 2")

# Give variables labels. Might seem redundant but think of future you!
# Each argument after the data frame is column_name = "Label"; the labelled
# data frame is stored back under the name "dogs"
library(expss)
dogs <- apply_labels(dogs,
                     breed = "Breed",
                     group = "Group",
                     height = "Height (in)",
                     weight = "Weight (lb)",
                     life_expect = "Life span (yr)",
                     affection = "Affectionate",
                     kids = "Good with kids",
                     dogs = "Good with other dogs",
                     shedding = "Shedding level",
                     grooming = "Coat grooming frequency",
                     drooling = "Drooling level",
                     coatT = "Coat type",
                     coatL = "Coat length",
                     strangers = "Stranger openness",
                     playful = "Playfulness",
                     protec = "Protectiveness",
                     adapt = "Adaptability",
                     train = "Trainability",
                     energy = "Energy level",
                     bark = "Barking level",
                     stim = "Mental stimulation needs")

# Verify labels have been correctly assigned
str(dogs)

🗂️

5. Factorizing categorical variables

Function "factor" and "as.factor"
Parameter:
- Character or numeric column in your data frame
- Column in the data frame referenced by "$"
- Specify levels (raw data values)
- Outline labels (new data names)
Purpose: Creates a new column with the factorized data

# Syntax usage
# as.factor() converts a column as-is, using its existing values as the levels
data$new_column_name <- as.factor(data$column_name)
# factor() lets you list the raw values (levels) and the names to show (labels)
data$new_column_name <- factor(data$column_name,
levels = c(1, 2, 3),
labels = c("One", "Two", "Three"))

6. Putting it all together

The script below provides a comprehensive workflow for preparing your R environment, loading data, and preparing it for further analysis or visualization tasks
Adjust the file path to match the location and name of your actual CSV file

# RStudio workspace setup
# Point R at the folder that contains dogs.csv (adjust to your own path)
setwd("C:/Users/lianjarzbecker/Downloads")

# List of packages to install and load
# Only expss is listed: this script reads the CSV with read.csv()
packages <- c("expss")

# Function to install and load packages if not already installed
# Takes one package name (`pkg`, a character string), installs the package
# from CRAN if it is not installed yet, then attaches it with library().
install_and_load <- function(pkg) {
  # requireNamespace() only tests availability; with quietly = TRUE it
  # returns FALSE for a missing package instead of printing a warning
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cran.us.r-project.org")
  }
  # library() stops with an error if the package still cannot be loaded
  library(pkg, character.only = TRUE)
}

# Install and load packages
# invisible() keeps the list returned by lapply() from printing in the console
invisible(lapply(packages, install_and_load))

# Read data from CSV file and label variables
# Empty cells and the text "NA" are both read as NA
dogs <- read.csv("dogs.csv", na.strings = c("", "NA"))
dogs <- apply_labels(dogs,
                     breed = "Breed",
                     group = "Group",
                     height = "Height (in)",
                     weight = "Weight (lb)",
                     life_expect = "Life span (yr)",
                     affection = "Affectionate",
                     kids = "Good with kids",
                     dogs = "Good with other dogs",
                     shedding = "Shedding level",
                     grooming = "Coat grooming frequency",
                     drooling = "Drooling level",
                     coatT = "Coat type",
                     coatL = "Coat length",
                     strangers = "Stranger openness",
                     playful = "Playfulness",
                     protec = "Protectiveness",
                     adapt = "Adaptability",
                     train = "Trainability",
                     energy = "Energy level",
                     bark = "Barking level",
                     stim = "Mental stimulation needs")

# Factorize variables
# groupF keeps the existing values of group as its levels;
# barkF maps the raw values 1-5 onto descriptive labels
dogs$groupF <- as.factor(dogs$group)
dogs$barkF <- factor(dogs$bark,
                     levels = c(1, 2, 3, 4, 5),
                     labels = c("Only to alert", "Rarely",
                                "Sometimes", "Often", "Very vocal"))

# Optional: View the structure and stats summary of the data frame
str(dogs)
summary(dogs)

Up next: Scatterplots

Lian J. Arzbecker, Ph.D.

Lian Arzbecker