Project 1: Cyclistic bike-share analysis

Step 1: Install these packages in the console

install.packages(“tidyverse”)

install.packages(“lubridate”)

install.packages(“ggplot2”)

Step 2: Load installed packages using library()

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggplot2)
library(dplyr)

Step 3: Read 12 csv files representing 12 months of trips data, and creates 12 dataframes to store the read data

dec_21 <- read_csv(“202112.csv”)

jan_22 <- read_csv(“202201.csv”)

feb_22 <- read_csv(“202202.csv”)

mar_22 <- read_csv(“202203.csv”)

apr_22 <- read_csv(“202204.csv”)

may_22 <- read_csv(“202205.csv”)

jun_22 <- read_csv(“202206.csv”)

jul_22 <- read_csv(“202207.csv”)

aug_22 <- read_csv(“202208.csv”)

sep_22 <- read_csv(“202209.csv”)

oct_22 <- read_csv(“202210.csv”)

nov_22 <- read_csv(“202211.csv”)

dec_21 <- read_csv("202112.csv")

## Rows: 247540 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

jan_22 <- read_csv("202201.csv")

## Rows: 103770 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

feb_22 <- read_csv("202202.csv")

## Rows: 115609 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

mar_22 <- read_csv("202203.csv")

## Rows: 284042 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

apr_22 <- read_csv("202204.csv")

## Rows: 371249 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

may_22 <- read_csv("202205.csv")

## Rows: 634858 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

jun_22 <- read_csv("202206.csv")

## Rows: 769204 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

jul_22 <- read_csv("202207.csv")

## Rows: 823488 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

aug_22 <- read_csv("202208.csv")

## Rows: 785932 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

sep_22 <- read_csv("202209.csv")

## Rows: 701339 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

oct_22 <- read_csv("202210.csv")

## Rows: 558685 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

nov_22 <- read_csv("202211.csv")

## Rows: 337735 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl  (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Step 4: Merge 12 dataframes created above into one dataframe called “all_trips”

all_trips <- bind_rows(dec_21,jan_22,feb_22,mar_22,apr_22,may_22,jun_22,jul_22,aug_22,sep_22,oct_22,nov_22)

Step 5: De-select columns from dataframe “all_trips” that will not be used for this analysis

all_trips <- all_trips %>% 
  select(-c(start_lat,start_lng,end_lat,end_lng))

Step 6: Create a new column ‘date’ in dataframe “all_trips” and fill the column with calculated values using as.Date() function to find out the date from column ‘started_at’ format yyyy-mm-dd:

all_trips$date <- as.Date(all_trips$started_at)

Step 7: Create a new column ‘day_of_week’ in dataframe “all_trips” and fill the column with calculated values using as.Date to find out the day of the week of the ‘date’ column:

all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")

Step 8: Convert ‘day_of_week’ into numeric values (Monday = 1; Sunday = 7):

all_trips$day_of_week_num <- format(as.Date(all_trips$date), "%u")

Step 9: Create a column ‘ride_length’ and fill it with calculated values (the time difference between columns ‘started_at’ & ‘ended_at’):

(difftime() will produce value in seconds (num))

all_trips$ride_length <- difftime(all_trips$ended_at, all_trips$started_at)

Step 10: Convert datatype from char to num for column ‘ride_length’:

all_trips$ride_length <- as.numeric(all_trips$ride_length)

Step 11: Eliminate rows with negative ‘ride_length’ value, and unwanted values in ‘rideable_type’ other than classic_bike and electric_bike

all_trips <- all_trips[!(all_trips$ride_length < 0),]
all_trips <- filter(all_trips, all_trips$rideable_type == "classic_bike" | all_trips$rideable_type == "electric_bike")

Step 12: Inspect column ‘ride_length’ using summary() function:

summary(all_trips$ride_length)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   344.0   603.0   967.2  1067.0 93596.0

Step 13: Inspect dataframe ‘all_trips’

View the structure

str(all_trips)

## tibble [5,552,874 × 13] (S3: tbl_df/tbl/data.frame)
##  $ ride_id           : chr [1:5552874] "46F8167220E4431F" "73A77762838B32FD" "4CF42452054F59C5" "3278BA87BF698339" ...
##  $ rideable_type     : chr [1:5552874] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
##  $ started_at        : POSIXct[1:5552874], format: "2021-12-07 15:06:07" "2021-12-11 03:43:29" ...
##  $ ended_at          : POSIXct[1:5552874], format: "2021-12-07 15:13:42" "2021-12-11 04:10:23" ...
##  $ start_station_name: chr [1:5552874] "Laflin St & Cullerton St" "LaSalle Dr & Huron St" "Halsted St & North Branch St" "Halsted St & North Branch St" ...
##  $ start_station_id  : chr [1:5552874] "13307" "KP1705001026" "KA1504000117" "KA1504000117" ...
##  $ end_station_name  : chr [1:5552874] "Morgan St & Polk St" "Clarendon Ave & Leland Ave" "Broadway & Barry Ave" "LaSalle Dr & Huron St" ...
##  $ end_station_id    : chr [1:5552874] "TA1307000130" "TA1307000119" "13137" "KP1705001026" ...
##  $ member_casual     : chr [1:5552874] "member" "casual" "member" "member" ...
##  $ date              : Date[1:5552874], format: "2021-12-07" "2021-12-11" ...
##  $ day_of_week       : chr [1:5552874] "Tuesday" "Saturday" "Wednesday" "Sunday" ...
##  $ day_of_week_num   : chr [1:5552874] "2" "6" "3" "7" ...
##  $ ride_length       : num [1:5552874] 455 1614 766 883 1216 ...

View all columns’ name

colnames(all_trips)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "member_casual"     
## [10] "date"               "day_of_week"        "day_of_week_num"   
## [13] "ride_length"

View total number of rows

nrow(all_trips)

## [1] 5552874

Inspect member_casual column

table(all_trips$member_casual)

## 
##  casual  member 
## 2166344 3386530

Inspect rideable_type column

table(all_trips$rideable_type)

## 
##  classic_bike electric_bike 
##       2628471       2924403

Inspect day_of_week column

table(all_trips$day_of_week)

## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##    793459    734490    880459    746395    833884    764844    799343

table(all_trips$day_of_week_num)

## 
##      1      2      3      4      5      6      7 
## 734490 764844 799343 833884 793459 880459 746395

Step 14: Analysis

all_trips$day_of_week <- ordered(all_trips$day_of_week, level =c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))

aggregate(all_trips$ride_length ~ all_trips$member_casual + all_trips$day_of_week, FUN = mean)

##    all_trips$member_casual all_trips$day_of_week all_trips$ride_length
## 1                   casual                Monday             1295.4847
## 2                   member                Monday              736.4146
## 3                   casual               Tuesday             1172.8889
## 4                   member               Tuesday              728.8662
## 5                   casual             Wednesday             1129.5208
## 6                   member             Wednesday              723.9585
## 7                   casual              Thursday             1160.6411
## 8                   member              Thursday              737.9848
## 9                   casual                Friday             1230.1663
## 10                  member                Friday              750.0025
## 11                  casual              Saturday             1427.1772
## 12                  member              Saturday              849.0422
## 13                  casual                Sunday             1460.6215
## 14                  member                Sunday              842.8941

aggregate(all_trips$ride_length ~ all_trips$member_casual + all_trips$day_of_week + all_trips$rideable_type, FUN = mean)

##    all_trips$member_casual all_trips$day_of_week all_trips$rideable_type
## 1                   casual                Monday            classic_bike
## 2                   member                Monday            classic_bike
## 3                   casual               Tuesday            classic_bike
## 4                   member               Tuesday            classic_bike
## 5                   casual             Wednesday            classic_bike
## 6                   member             Wednesday            classic_bike
## 7                   casual              Thursday            classic_bike
## 8                   member              Thursday            classic_bike
## 9                   casual                Friday            classic_bike
## 10                  member                Friday            classic_bike
## 11                  casual              Saturday            classic_bike
## 12                  member              Saturday            classic_bike
## 13                  casual                Sunday            classic_bike
## 14                  member                Sunday            classic_bike
## 15                  casual                Monday           electric_bike
## 16                  member                Monday           electric_bike
## 17                  casual               Tuesday           electric_bike
## 18                  member               Tuesday           electric_bike
## 19                  casual             Wednesday           electric_bike
## 20                  member             Wednesday           electric_bike
## 21                  casual              Thursday           electric_bike
## 22                  member              Thursday           electric_bike
## 23                  casual                Friday           electric_bike
## 24                  member                Friday           electric_bike
## 25                  casual              Saturday           electric_bike
## 26                  member              Saturday           electric_bike
## 27                  casual                Sunday           electric_bike
## 28                  member                Sunday           electric_bike
##    all_trips$ride_length
## 1              1753.9444
## 2               803.7589
## 3              1636.5008
## 4               798.7257
## 5              1567.9993
## 6               788.8052
## 7              1588.5869
## 8               807.2909
## 9              1670.0205
## 10              817.3052
## 11             1823.6458
## 12              933.7907
## 13             1896.9662
## 14              922.0084
## 15              979.6059
## 16              662.6166
## 17              875.3609
## 18              653.6399
## 19              859.3384
## 20              657.1305
## 21              885.2522
## 22              668.1723
## 23              944.2425
## 24              683.9637
## 25             1094.9074
## 26              759.7843
## 27             1105.9543
## 28              757.3954

all_trips %>%
  group_by(member_casual, rideable_type) %>%
    summarise(number_of_rides = n())

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

## # A tibble: 4 × 3
## # Groups:   member_casual [2]
##   member_casual rideable_type number_of_rides
##   <chr>         <chr>                   <int>
## 1 casual        classic_bike           898597
## 2 casual        electric_bike         1267747
## 3 member        classic_bike          1729874
## 4 member        electric_bike         1656656

all_trips %>%
  group_by(member_casual) %>%
    summarise(number_of_rides = n(), average_duration = mean(ride_length))

## # A tibble: 2 × 3
##   member_casual number_of_rides average_duration
##   <chr>                   <int>            <dbl>
## 1 casual                2166344            1287.
## 2 member                3386530             763.

Step 15: Prepare for visualization

week_day_order <- c('Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday')

trips_by_month <- data.frame (
  month_year = c("Dec 21", "Jan 22", "Feb 22", "Mar 22", "Apr 22", "May 22", "Jun 22", "Jul 22", "Aug 22", "Sep 22", "Oct 22", "Nov 22"),
  number_of_trips = c(nrow(dec_21), nrow(jan_22), nrow(feb_22), nrow(mar_22), nrow(apr_22), nrow(may_22), nrow(jun_22), nrow(jul_22), nrow(aug_22), nrow(sep_22), nrow(oct_22), nrow(nov_22))
)
trips_by_month$month_year <- factor(trips_by_month$month_year, levels = c("Dec 21", "Jan 22", "Feb 22", "Mar 22", "Apr 22", "May 22", "Jun 22", "Jul 22", "Aug 22", "Sep 22", "Oct 22", "Nov 22"))

Step 16: Visualization

Visual 15A: Visualize the Number of Rides by Day of Week and Rider Type

all_trips %>%
    group_by(member_casual, day_of_week) %>%
      summarise(number_of_rides = n()) %>%
        arrange(member_casual, day_of_week) %>%
          ggplot(aes(x = factor(day_of_week, level = week_day_order), y = number_of_rides, fill= member_casual)) + geom_col(position = "dodge") + ggtitle("Number of rides by day of week and rider type") + xlab("Day of Week") + ylab("Number of Rides") + scale_fill_discrete(name = "Rider Type") + scale_y_continuous(labels = scales::comma)

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Visual 15B: Visualize the Average Duration by Day of Week and Rider Type

all_trips %>%
    group_by(member_casual, day_of_week) %>%
      summarise(average_duration = mean(ride_length)) %>%
        arrange(member_casual, day_of_week) %>%
          ggplot(aes(x = factor(day_of_week, level = week_day_order), y = average_duration, fill = member_casual)) + geom_col(position = "dodge") + ggtitle("Average duration by day of week and rider type") + xlab("Day of Week") + ylab("Average Duration") + scale_fill_discrete(name = "Rider Type")

## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

Visual 15C: Visualize the Number of Rides by Rider Type and Bike Type

all_trips %>%
    group_by(rideable_type, member_casual) %>%
      summarise(number_of_rides = n()) %>%
        arrange(rideable_type, member_casual) %>%
          ggplot(mapping=aes(x=member_casual, y= number_of_rides, fill=rideable_type)) + geom_bar(stat = "identity") + ggtitle("Number of rides by rider type and bike type") + xlab("Rider Type") + ylab("Number of Rides") + scale_fill_discrete(name = "Type of Bike") + scale_y_continuous(labels = scales::comma)

## `summarise()` has grouped output by 'rideable_type'. You can override using the
## `.groups` argument.

Visual 15D: Visualize the total amount of rides for each month

ggplot(trips_by_month, aes(x=month_year, y=number_of_trips, fill=month_year)) + geom_bar(stat = "identity") + scale_y_continuous(labels = scales::comma) + scale_fill_manual(breaks = c("Dec 21", "Jan 22", "Feb 22", "Mar 22", "Apr 22", "May 22", "Jun 22", "Jul 22", "Aug 22", "Sep 22", "Oct 22", "Nov 22"), values = c("#2fcbfa", "#362ffa", "#2f5ffa", "#2ffae6", "#2ffa44", "#cefa2f", "#fa9b2f", "#fa582f", "#fa7a2f", "#face2f", "#a2fa2f", "#2ffa9b")) + ggtitle("Total number of trip records by month") + xlab("Month, Year") + ylab("Number of Trips") + theme(legend.position = "none")

END OF ANALYSIS

Export dataset ‘all_trips’ to CSV file for Tableau:

write.csv(all_trips, “~/Project 1 Cyclistic bike-share analysis/all_trips.csv”, row.names=FALSE)

counts <- aggregate(all_trips$ride_length~all_trips$member_casual+all_trips$day_of_week, FUN = mean)

write.csv(counts, “~/Project 1 Cyclistic bike-share analysis/avg_ride_length.csv”, row.names=FALSE)