install.packages(“tidyverse”)
install.packages(“lubridate”)
install.packages(“ggplot2”)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(dplyr)
dec_21 <- read_csv(“202112.csv”)
jan_22 <- read_csv(“202201.csv”)
feb_22 <- read_csv(“202202.csv”)
mar_22 <- read_csv(“202203.csv”)
apr_22 <- read_csv(“202204.csv”)
may_22 <- read_csv(“202205.csv”)
jun_22 <- read_csv(“202206.csv”)
jul_22 <- read_csv(“202207.csv”)
aug_22 <- read_csv(“202208.csv”)
sep_22 <- read_csv(“202209.csv”)
oct_22 <- read_csv(“202210.csv”)
nov_22 <- read_csv(“202211.csv”)
dec_21 <- read_csv("202112.csv")
## Rows: 247540 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
jan_22 <- read_csv("202201.csv")
## Rows: 103770 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
feb_22 <- read_csv("202202.csv")
## Rows: 115609 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mar_22 <- read_csv("202203.csv")
## Rows: 284042 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
apr_22 <- read_csv("202204.csv")
## Rows: 371249 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
may_22 <- read_csv("202205.csv")
## Rows: 634858 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
jun_22 <- read_csv("202206.csv")
## Rows: 769204 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
jul_22 <- read_csv("202207.csv")
## Rows: 823488 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
aug_22 <- read_csv("202208.csv")
## Rows: 785932 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sep_22 <- read_csv("202209.csv")
## Rows: 701339 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
oct_22 <- read_csv("202210.csv")
## Rows: 558685 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nov_22 <- read_csv("202211.csv")
## Rows: 337735 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, start_station_name, start_station_id, end_...
## dbl (4): start_lat, start_lng, end_lat, end_lng
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all_trips <- bind_rows(dec_21,jan_22,feb_22,mar_22,apr_22,may_22,jun_22,jul_22,aug_22,sep_22,oct_22,nov_22)
all_trips <- all_trips %>%
select(-c(start_lat,start_lng,end_lat,end_lng))
all_trips$date <- as.Date(all_trips$started_at)
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
all_trips$day_of_week_num <- format(as.Date(all_trips$date), "%u")
(difftime() will produce value in seconds (num))
all_trips$ride_length <- difftime(all_trips$ended_at, all_trips$started_at)
all_trips$ride_length <- as.numeric(all_trips$ride_length)
all_trips <- all_trips[!(all_trips$ride_length < 0),]
all_trips <- filter(all_trips, all_trips$rideable_type == "classic_bike" | all_trips$rideable_type == "electric_bike")
summary(all_trips$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 344.0 603.0 967.2 1067.0 93596.0
View the structure
str(all_trips)
## tibble [5,552,874 × 13] (S3: tbl_df/tbl/data.frame)
## $ ride_id : chr [1:5552874] "46F8167220E4431F" "73A77762838B32FD" "4CF42452054F59C5" "3278BA87BF698339" ...
## $ rideable_type : chr [1:5552874] "electric_bike" "electric_bike" "electric_bike" "classic_bike" ...
## $ started_at : POSIXct[1:5552874], format: "2021-12-07 15:06:07" "2021-12-11 03:43:29" ...
## $ ended_at : POSIXct[1:5552874], format: "2021-12-07 15:13:42" "2021-12-11 04:10:23" ...
## $ start_station_name: chr [1:5552874] "Laflin St & Cullerton St" "LaSalle Dr & Huron St" "Halsted St & North Branch St" "Halsted St & North Branch St" ...
## $ start_station_id : chr [1:5552874] "13307" "KP1705001026" "KA1504000117" "KA1504000117" ...
## $ end_station_name : chr [1:5552874] "Morgan St & Polk St" "Clarendon Ave & Leland Ave" "Broadway & Barry Ave" "LaSalle Dr & Huron St" ...
## $ end_station_id : chr [1:5552874] "TA1307000130" "TA1307000119" "13137" "KP1705001026" ...
## $ member_casual : chr [1:5552874] "member" "casual" "member" "member" ...
## $ date : Date[1:5552874], format: "2021-12-07" "2021-12-11" ...
## $ day_of_week : chr [1:5552874] "Tuesday" "Saturday" "Wednesday" "Sunday" ...
## $ day_of_week_num : chr [1:5552874] "2" "6" "3" "7" ...
## $ ride_length : num [1:5552874] 455 1614 766 883 1216 ...
View all columns’ name
colnames(all_trips)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "member_casual"
## [10] "date" "day_of_week" "day_of_week_num"
## [13] "ride_length"
View total number of rows
nrow(all_trips)
## [1] 5552874
Inspect member_casual column
table(all_trips$member_casual)
##
## casual member
## 2166344 3386530
Inspect rideable_type column
table(all_trips$rideable_type)
##
## classic_bike electric_bike
## 2628471 2924403
Inspect day_of_week column
table(all_trips$day_of_week)
##
## Friday Monday Saturday Sunday Thursday Tuesday Wednesday
## 793459 734490 880459 746395 833884 764844 799343
table(all_trips$day_of_week_num)
##
## 1 2 3 4 5 6 7
## 734490 764844 799343 833884 793459 880459 746395
all_trips$day_of_week <- ordered(all_trips$day_of_week, level =c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
aggregate(all_trips$ride_length ~ all_trips$member_casual + all_trips$day_of_week, FUN = mean)
## all_trips$member_casual all_trips$day_of_week all_trips$ride_length
## 1 casual Monday 1295.4847
## 2 member Monday 736.4146
## 3 casual Tuesday 1172.8889
## 4 member Tuesday 728.8662
## 5 casual Wednesday 1129.5208
## 6 member Wednesday 723.9585
## 7 casual Thursday 1160.6411
## 8 member Thursday 737.9848
## 9 casual Friday 1230.1663
## 10 member Friday 750.0025
## 11 casual Saturday 1427.1772
## 12 member Saturday 849.0422
## 13 casual Sunday 1460.6215
## 14 member Sunday 842.8941
aggregate(all_trips$ride_length ~ all_trips$member_casual + all_trips$day_of_week + all_trips$rideable_type, FUN = mean)
## all_trips$member_casual all_trips$day_of_week all_trips$rideable_type
## 1 casual Monday classic_bike
## 2 member Monday classic_bike
## 3 casual Tuesday classic_bike
## 4 member Tuesday classic_bike
## 5 casual Wednesday classic_bike
## 6 member Wednesday classic_bike
## 7 casual Thursday classic_bike
## 8 member Thursday classic_bike
## 9 casual Friday classic_bike
## 10 member Friday classic_bike
## 11 casual Saturday classic_bike
## 12 member Saturday classic_bike
## 13 casual Sunday classic_bike
## 14 member Sunday classic_bike
## 15 casual Monday electric_bike
## 16 member Monday electric_bike
## 17 casual Tuesday electric_bike
## 18 member Tuesday electric_bike
## 19 casual Wednesday electric_bike
## 20 member Wednesday electric_bike
## 21 casual Thursday electric_bike
## 22 member Thursday electric_bike
## 23 casual Friday electric_bike
## 24 member Friday electric_bike
## 25 casual Saturday electric_bike
## 26 member Saturday electric_bike
## 27 casual Sunday electric_bike
## 28 member Sunday electric_bike
## all_trips$ride_length
## 1 1753.9444
## 2 803.7589
## 3 1636.5008
## 4 798.7257
## 5 1567.9993
## 6 788.8052
## 7 1588.5869
## 8 807.2909
## 9 1670.0205
## 10 817.3052
## 11 1823.6458
## 12 933.7907
## 13 1896.9662
## 14 922.0084
## 15 979.6059
## 16 662.6166
## 17 875.3609
## 18 653.6399
## 19 859.3384
## 20 657.1305
## 21 885.2522
## 22 668.1723
## 23 944.2425
## 24 683.9637
## 25 1094.9074
## 26 759.7843
## 27 1105.9543
## 28 757.3954
all_trips %>%
group_by(member_casual, rideable_type) %>%
summarise(number_of_rides = n())
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: member_casual [2]
## member_casual rideable_type number_of_rides
## <chr> <chr> <int>
## 1 casual classic_bike 898597
## 2 casual electric_bike 1267747
## 3 member classic_bike 1729874
## 4 member electric_bike 1656656
all_trips %>%
group_by(member_casual) %>%
summarise(number_of_rides = n(), average_duration = mean(ride_length))
## # A tibble: 2 × 3
## member_casual number_of_rides average_duration
## <chr> <int> <dbl>
## 1 casual 2166344 1287.
## 2 member 3386530 763.
week_day_order <- c('Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday')
trips_by_month <- data.frame (
month_year = c("Dec 21", "Jan 22", "Feb 22", "Mar 22", "Apr 22", "May 22", "Jun 22", "Jul 22", "Aug 22", "Sep 22", "Oct 22", "Nov 22"),
number_of_trips = c(nrow(dec_21), nrow(jan_22), nrow(feb_22), nrow(mar_22), nrow(apr_22), nrow(may_22), nrow(jun_22), nrow(jul_22), nrow(aug_22), nrow(sep_22), nrow(oct_22), nrow(nov_22))
)
trips_by_month$month_year <- factor(trips_by_month$month_year, levels = c("Dec 21", "Jan 22", "Feb 22", "Mar 22", "Apr 22", "May 22", "Jun 22", "Jul 22", "Aug 22", "Sep 22", "Oct 22", "Nov 22"))
Visual 15A: Visualize the Number of Rides by Day of Week and Rider Type
all_trips %>%
group_by(member_casual, day_of_week) %>%
summarise(number_of_rides = n()) %>%
arrange(member_casual, day_of_week) %>%
ggplot(aes(x = factor(day_of_week, level = week_day_order), y = number_of_rides, fill= member_casual)) + geom_col(position = "dodge") + ggtitle("Number of rides by day of week and rider type") + xlab("Day of Week") + ylab("Number of Rides") + scale_fill_discrete(name = "Rider Type") + scale_y_continuous(labels = scales::comma)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
Visual 15B: Visualize the Average Duration by Day of Week and Rider Type
all_trips %>%
group_by(member_casual, day_of_week) %>%
summarise(average_duration = mean(ride_length)) %>%
arrange(member_casual, day_of_week) %>%
ggplot(aes(x = factor(day_of_week, level = week_day_order), y = average_duration, fill = member_casual)) + geom_col(position = "dodge") + ggtitle("Average duration by day of week and rider type") + xlab("Day of Week") + ylab("Average Duration") + scale_fill_discrete(name = "Rider Type")
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
Visual 15C: Visualize the Number of Rides by Rider Type and Bike Type
all_trips %>%
group_by(rideable_type, member_casual) %>%
summarise(number_of_rides = n()) %>%
arrange(rideable_type, member_casual) %>%
ggplot(mapping=aes(x=member_casual, y= number_of_rides, fill=rideable_type)) + geom_bar(stat = "identity") + ggtitle("Number of rides by rider type and bike type") + xlab("Rider Type") + ylab("Number of Rides") + scale_fill_discrete(name = "Type of Bike") + scale_y_continuous(labels = scales::comma)
## `summarise()` has grouped output by 'rideable_type'. You can override using the
## `.groups` argument.
Visual 15D: Visualize the total amount of rides for each month
ggplot(trips_by_month, aes(x=month_year, y=number_of_trips, fill=month_year)) + geom_bar(stat = "identity") + scale_y_continuous(labels = scales::comma) + scale_fill_manual(breaks = c("Dec 21", "Jan 22", "Feb 22", "Mar 22", "Apr 22", "May 22", "Jun 22", "Jul 22", "Aug 22", "Sep 22", "Oct 22", "Nov 22"), values = c("#2fcbfa", "#362ffa", "#2f5ffa", "#2ffae6", "#2ffa44", "#cefa2f", "#fa9b2f", "#fa582f", "#fa7a2f", "#face2f", "#a2fa2f", "#2ffa9b")) + ggtitle("Total number of trip records by month") + xlab("Month, Year") + ylab("Number of Trips") + theme(legend.position = "none")
write.csv(all_trips, “~/Project 1 Cyclistic bike-share analysis/all_trips.csv”, row.names=FALSE)
counts <- aggregate(all_trips\(ride_length~all_trips\)member_casual+all_trips$day_of_week, FUN = mean)
write.csv(counts, “~/Project 1 Cyclistic bike-share analysis/avg_ride_length.csv”, row.names=FALSE)