{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":["B9zEH7lcwIY-"],"authorship_tag":"ABX9TyP97yrn425DiLa2CzLtDJf1"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# This notebook is intended to sample data from source"],"metadata":{"id":"OgJvkbgP881P"}},{"cell_type":"markdown","source":["## Import essential libraries"],"metadata":{"id":"B9zEH7lcwIY-"}},{"cell_type":"code","source":["# import libraries\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","import glob\n","import os"],"metadata":{"id":"DSlfFQP39CYF","executionInfo":{"status":"ok","timestamp":1724876999278,"user_tz":240,"elapsed":3730,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":1,"outputs":[]},{"cell_type":"code","source":["# Mount the drive\n","from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"scbbbV859E-M","executionInfo":{"status":"ok","timestamp":1724877080857,"user_tz":240,"elapsed":81584,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"da0687ec-971c-45b9-81b4-90a5a602353d"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"markdown","source":["## Some functions to help us"],"metadata":{"id":"lDHWKbfrwMWt"}},{"cell_type":"code","source":["def retrieve_month_data(year, month, folder_path):\n"," # Format the year and month to match the file naming convention\n"," year_str = str(year)\n"," month_str = f'{month:02d}'\n","\n"," # Define the pattern to match files for the given year and month\n"," file_pattern = f'citi_{year_str}{month_str}_*.csv'\n","\n"," file_paths = glob.glob(os.path.join(folder_path, file_pattern))\n","\n"," dfs = []\n","\n"," for file in file_paths:\n"," df = pd.read_csv(file)\n"," dfs.append(df)\n","\n"," # Combine all the dataframes into one\n"," combined_df = pd.concat(dfs, ignore_index=True)\n","\n"," return combined_df"],"metadata":{"id":"GHQzjWzI9Gd4","executionInfo":{"status":"ok","timestamp":1724877467469,"user_tz":240,"elapsed":166,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":3,"outputs":[]},{"cell_type":"code","source":["def retrieve_sampled_month_data(year, month, folder_path, fraction=0.01, random_state=42):\n"," all_month_data = retrieve_month_data(year, month, folder_path)\n"," sampled_df = all_month_data.sample(frac=fraction, random_state=42)\n"," sampled_df = sampled_df.drop(columns=['Unnamed: 0'])\n"," return sampled_df"],"metadata":{"id":"VSAoAQ3VsYlW","executionInfo":{"status":"ok","timestamp":1724877650088,"user_tz":240,"elapsed":144,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":16,"outputs":[]},{"cell_type":"code","source":["def retrieve_sampled_year_data(year, folder_path, fraction=0.01):\n"," sampled_dfs = []\n","\n"," for month in range(1, 13):\n"," # Retrieve & Sample\n"," sampled_df = retrieve_sampled_month_data(year, month, folder_path, random_state=42)\n","\n"," # Append\n"," sampled_dfs.append(sampled_df)\n","\n"," # Combine\n"," combined_sampled_df = pd.concat(sampled_dfs, ignore_index=True)\n","\n"," #data type check\n"," combined_sampled_df = date_feature_engineering(combined_sampled_df)\n","\n"," return combined_sampled_df"],"metadata":{"id":"vcFkXrKn9MZn","executionInfo":{"status":"ok","timestamp":1724877469083,"user_tz":240,"elapsed":143,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":5,"outputs":[]},{"cell_type":"code","source":["def date_feature_engineering(given_df):\n"," given_df['started_at'] = pd.to_datetime(given_df['started_at'])\n"," given_df['ended_at'] = pd.to_datetime(given_df['ended_at'])\n"," given_df['ride_date'] = given_df['started_at'].dt.date\n"," given_df['ride_year'] = given_df['started_at'].dt.year\n"," given_df['ride_month'] = given_df['started_at'].dt.month\n"," given_df['ride_day'] = given_df['started_at'].dt.day\n","\n"," # check if it's weekend or not\n"," given_df['ride_date'] = pd.to_datetime(given_df['ride_date'])\n"," given_df['is_weekend'] = given_df['ride_date'].dt.weekday.isin([5, 6])\n","\n"," # calculate the duration of the ride\n"," given_df['duration'] = given_df['ended_at'] - given_df['started_at']\n"," given_df['duration_minutes'] = round(given_df['duration'].dt.total_seconds() / 60, 1)\n","\n"," given_df = given_df.reset_index(drop=True)\n"," return given_df"],"metadata":{"id":"8cBpwuKU9j4w","executionInfo":{"status":"ok","timestamp":1724878049441,"user_tz":240,"elapsed":139,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":35,"outputs":[]},{"cell_type":"markdown","source":["## Data Source & Retrieve Data\n","Data can be accessed from https://citibikenyc.com/system-data"],"metadata":{"id":"x7tUf-7Fc4uc"}},{"cell_type":"code","source":["folder_path = \"/content/drive/MyDrive/afterschool_projects/ny_bike/citibike-tripdata\"\n","data202407 = retrieve_sampled_month_data(2024, 7, folder_path, fraction=0.01, random_state=42)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Cccb0dLp9PlB","executionInfo":{"status":"ok","timestamp":1724877683278,"user_tz":240,"elapsed":27871,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"ae647b0f-58c1-43b2-c8d0-887e7099f7b6"},"execution_count":17,"outputs":[{"output_type":"stream","name":"stderr","text":[":14: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(file)\n",":14: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(file)\n",":14: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(file)\n",":14: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(file)\n",":14: DtypeWarning: Columns (6,8) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(file)\n"]}]},{"cell_type":"code","source":["data202407.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"acstjDJr_GVT","executionInfo":{"status":"ok","timestamp":1724877992209,"user_tz":240,"elapsed":137,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"215ecf2a-5538-4ac4-c21c-29cb7756711b"},"execution_count":27,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(47086, 17)"]},"metadata":{},"execution_count":27}]},{"cell_type":"code","source":["data202407.isna().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":610},"id":"pYEFYB5v_qDl","executionInfo":{"status":"ok","timestamp":1724878000354,"user_tz":240,"elapsed":308,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"669fe980-0a36-4f91-c3cd-afa2c5a7a069"},"execution_count":28,"outputs":[{"output_type":"execute_result","data":{"text/plain":["ride_id 0\n","rideable_type 0\n","started_at 0\n","ended_at 0\n","start_station_name 0\n","start_station_id 0\n","end_station_name 0\n","end_station_id 0\n","start_lat 0\n","start_lng 0\n","end_lat 0\n","end_lng 0\n","member_casual 0\n","ride_date 0\n","ride_year 0\n","ride_month 0\n","ride_day 0\n","dtype: int64"],"text/html":["
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
0
ride_id0
rideable_type0
started_at0
ended_at0
start_station_name0
start_station_id0
end_station_name0
end_station_id0
start_lat0
start_lng0
end_lat0
end_lng0
member_casual0
ride_date0
ride_year0
ride_month0
ride_day0
\n","

"]},"metadata":{},"execution_count":28}]},{"cell_type":"code","source":["data202407 = data202407.dropna()"],"metadata":{"id":"h7XS0ozF_sTc","executionInfo":{"status":"ok","timestamp":1724878001931,"user_tz":240,"elapsed":143,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":29,"outputs":[]},{"cell_type":"code","source":["data202407.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"4F5dLGU8_0sr","executionInfo":{"status":"ok","timestamp":1724878002864,"user_tz":240,"elapsed":139,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"811c3744-6b58-4487-c5c6-2ad66bed16e4"},"execution_count":30,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(47086, 17)"]},"metadata":{},"execution_count":30}]},{"cell_type":"code","source":["data202407.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":309},"id":"iLqIzbnJAGbN","executionInfo":{"status":"ok","timestamp":1724878004019,"user_tz":240,"elapsed":485,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"d5a88fef-5339-43e5-f751-81a87ad34ed4"},"execution_count":31,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" ride_id rideable_type started_at \\\n","1120985 C00A9E6382932E6A classic_bike 2024-07-09 12:03:39.969 \n","619370 031CE5705DA05DF7 electric_bike 2024-07-14 13:57:05.636 \n","2075471 20F7951EEAD5CEC9 electric_bike 2024-07-15 13:38:21.594 \n","123236 8BA0DF2DDB523E40 electric_bike 2024-07-12 18:17:36.944 \n","441030 A3C83CEB20C4A614 electric_bike 2024-07-09 14:24:17.723 \n","\n"," ended_at start_station_name start_station_id \\\n","1120985 2024-07-09 12:08:49.318 Broadway & W 157 St 8123.06 \n","619370 2024-07-14 14:23:00.581 E 23 St & 1 Ave 5929.01 \n","2075471 2024-07-15 13:42:36.363 Beaver St & Fayette St 4840.06 \n","123236 2024-07-12 18:22:01.011 E 115 St & Lexington Ave 7599.09 \n","441030 2024-07-09 14:39:30.770 W 44 St & 5 Ave 6551.02 \n","\n"," end_station_name end_station_id start_lat start_lng \\\n","1120985 St Nicholas Ave & W 155 St 8085.05 40.834027 -73.945108 \n","619370 E 93 St & 2 Ave 7286.02 40.736502 -73.978095 \n","2075471 Cedar St & Myrtle Ave 4751.01 40.700990 -73.939210 \n","123236 Lenox Ave & W 117 St 7655.22 40.797911 -73.942300 \n","441030 E 72 St & York Ave 6889.12 40.755067 -73.980111 \n","\n"," end_lat end_lng member_casual ride_date ride_year \\\n","1120985 40.830663 -73.941323 member 2024-07-09 2024 \n","619370 40.782454 -73.948920 member 2024-07-14 2024 \n","2075471 40.697842 -73.926241 member 2024-07-15 2024 \n","123236 40.802557 -73.949078 member 2024-07-12 2024 \n","441030 40.766638 -73.953483 member 2024-07-09 2024 \n","\n"," ride_month ride_day \n","1120985 7 9 \n","619370 7 14 \n","2075471 7 15 \n","123236 7 12 \n","441030 7 9 "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
ride_idrideable_typestarted_atended_atstart_station_namestart_station_idend_station_nameend_station_idstart_latstart_lngend_latend_lngmember_casualride_dateride_yearride_monthride_day
1120985C00A9E6382932E6Aclassic_bike2024-07-09 12:03:39.9692024-07-09 12:08:49.318Broadway & W 157 St8123.06St Nicholas Ave & W 155 St8085.0540.834027-73.94510840.830663-73.941323member2024-07-09202479
619370031CE5705DA05DF7electric_bike2024-07-14 13:57:05.6362024-07-14 14:23:00.581E 23 St & 1 Ave5929.01E 93 St & 2 Ave7286.0240.736502-73.97809540.782454-73.948920member2024-07-142024714
207547120F7951EEAD5CEC9electric_bike2024-07-15 13:38:21.5942024-07-15 13:42:36.363Beaver St & Fayette St4840.06Cedar St & Myrtle Ave4751.0140.700990-73.93921040.697842-73.926241member2024-07-152024715
1232368BA0DF2DDB523E40electric_bike2024-07-12 18:17:36.9442024-07-12 18:22:01.011E 115 St & Lexington Ave7599.09Lenox Ave & W 117 St7655.2240.797911-73.94230040.802557-73.949078member2024-07-122024712
441030A3C83CEB20C4A614electric_bike2024-07-09 14:24:17.7232024-07-09 14:39:30.770W 44 St & 5 Ave6551.02E 72 St & York Ave6889.1240.755067-73.98011140.766638-73.953483member2024-07-09202479
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"data202407","repr_error":"0"}},"metadata":{},"execution_count":31}]},{"cell_type":"code","source":["df = date_feature_engineering(data202407)"],"metadata":{"id":"PLKLPsvM_2gp","executionInfo":{"status":"ok","timestamp":1724878005100,"user_tz":240,"elapsed":153,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":32,"outputs":[]},{"cell_type":"code","source":["df.columns"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"909_Z3MV_8cj","executionInfo":{"status":"ok","timestamp":1724878006986,"user_tz":240,"elapsed":137,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"03fc2640-4aa4-47c1-aaf6-579331ca1b59"},"execution_count":33,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',\n"," 'start_station_name', 'start_station_id', 'end_station_name',\n"," 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',\n"," 'member_casual', 'ride_date', 'ride_year', 'ride_month', 'ride_day',\n"," 'is_weekend', 'duration', 'duration_minutes'],\n"," dtype='object')"]},"metadata":{},"execution_count":33}]},{"cell_type":"code","source":["df.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":309},"id":"uUnwvLKIAnAd","executionInfo":{"status":"ok","timestamp":1724878012209,"user_tz":240,"elapsed":727,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"49618748-2610-4c9c-feec-13b25529cc0c"},"execution_count":34,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" ride_id rideable_type started_at \\\n","0 C00A9E6382932E6A classic_bike 2024-07-09 12:03:39.969 \n","1 031CE5705DA05DF7 electric_bike 2024-07-14 13:57:05.636 \n","2 20F7951EEAD5CEC9 electric_bike 2024-07-15 13:38:21.594 \n","3 8BA0DF2DDB523E40 electric_bike 2024-07-12 18:17:36.944 \n","4 A3C83CEB20C4A614 electric_bike 2024-07-09 14:24:17.723 \n","\n"," ended_at start_station_name start_station_id \\\n","0 2024-07-09 12:08:49.318 Broadway & W 157 St 8123.06 \n","1 2024-07-14 14:23:00.581 E 23 St & 1 Ave 5929.01 \n","2 2024-07-15 13:42:36.363 Beaver St & Fayette St 4840.06 \n","3 2024-07-12 18:22:01.011 E 115 St & Lexington Ave 7599.09 \n","4 2024-07-09 14:39:30.770 W 44 St & 5 Ave 6551.02 \n","\n"," end_station_name end_station_id start_lat start_lng end_lat \\\n","0 St Nicholas Ave & W 155 St 8085.05 40.834027 -73.945108 40.830663 \n","1 E 93 St & 2 Ave 7286.02 40.736502 -73.978095 40.782454 \n","2 Cedar St & Myrtle Ave 4751.01 40.700990 -73.939210 40.697842 \n","3 Lenox Ave & W 117 St 7655.22 40.797911 -73.942300 40.802557 \n","4 E 72 St & York Ave 6889.12 40.755067 -73.980111 40.766638 \n","\n"," end_lng member_casual ride_date ride_year ride_month ride_day \\\n","0 -73.941323 member 2024-07-09 2024 7 9 \n","1 -73.948920 member 2024-07-14 2024 7 14 \n","2 -73.926241 member 2024-07-15 2024 7 15 \n","3 -73.949078 member 2024-07-12 2024 7 12 \n","4 -73.953483 member 2024-07-09 2024 7 9 \n","\n"," is_weekend duration duration_minutes \n","0 False 0 days 00:05:09.349000 5.2 \n","1 True 0 days 00:25:54.945000 25.9 \n","2 False 0 days 00:04:14.769000 4.2 \n","3 False 0 days 00:04:24.067000 4.4 \n","4 False 0 days 00:15:13.047000 15.2 "],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
ride_idrideable_typestarted_atended_atstart_station_namestart_station_idend_station_nameend_station_idstart_latstart_lngend_latend_lngmember_casualride_dateride_yearride_monthride_dayis_weekenddurationduration_minutes
0C00A9E6382932E6Aclassic_bike2024-07-09 12:03:39.9692024-07-09 12:08:49.318Broadway & W 157 St8123.06St Nicholas Ave & W 155 St8085.0540.834027-73.94510840.830663-73.941323member2024-07-09202479False0 days 00:05:09.3490005.2
1031CE5705DA05DF7electric_bike2024-07-14 13:57:05.6362024-07-14 14:23:00.581E 23 St & 1 Ave5929.01E 93 St & 2 Ave7286.0240.736502-73.97809540.782454-73.948920member2024-07-142024714True0 days 00:25:54.94500025.9
220F7951EEAD5CEC9electric_bike2024-07-15 13:38:21.5942024-07-15 13:42:36.363Beaver St & Fayette St4840.06Cedar St & Myrtle Ave4751.0140.700990-73.93921040.697842-73.926241member2024-07-152024715False0 days 00:04:14.7690004.2
38BA0DF2DDB523E40electric_bike2024-07-12 18:17:36.9442024-07-12 18:22:01.011E 115 St & Lexington Ave7599.09Lenox Ave & W 117 St7655.2240.797911-73.94230040.802557-73.949078member2024-07-122024712False0 days 00:04:24.0670004.4
4A3C83CEB20C4A614electric_bike2024-07-09 14:24:17.7232024-07-09 14:39:30.770W 44 St & 5 Ave6551.02E 72 St & York Ave6889.1240.755067-73.98011140.766638-73.953483member2024-07-09202479False0 days 00:15:13.04700015.2
\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"df","summary":"{\n \"name\": \"df\",\n \"rows\": 47086,\n \"fields\": [\n {\n \"column\": \"ride_id\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 47086,\n \"samples\": [\n \"6CF923D27E1D98D3\",\n \"5A68720B9AB37AD6\",\n \"3A8A19939E6280F5\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"rideable_type\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"electric_bike\",\n \"classic_bike\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"started_at\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2024-06-30 22:36:54.164000\",\n \"max\": \"2024-07-31 23:55:00.041000\",\n \"num_unique_values\": 47085,\n \"samples\": [\n \"2024-07-20 15:26:59.288000\",\n \"2024-07-06 13:25:27.091000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ended_at\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2024-07-01 00:01:58.495000\",\n \"max\": \"2024-07-31 23:59:51.069000\",\n \"num_unique_values\": 47086,\n \"samples\": [\n \"2024-07-24 15:32:47.803000\",\n \"2024-07-20 15:20:28.703000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"start_station_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2061,\n \"samples\": [\n \"3 Ave & Franklin Ave\",\n \"Great Jones St\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"start_station_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2093,\n \"samples\": [\n \"8172.07\",\n \"6191.06\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"end_station_name\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2067,\n \"samples\": [\n \"86 St & 35 Ave\",\n \"University Pl & E 14 St\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"end_station_id\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2818,\n \"samples\": [\n \"8144.09\",\n \"6515.01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"start_lat\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.04119880903282505,\n \"min\": 40.633385,\n \"max\": 40.8863,\n \"num_unique_values\": 12854,\n \"samples\": [\n 40.703508139,\n 40.814295411\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"start_lng\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.029628605442546255,\n \"min\": -74.026823,\n \"max\": -73.84672,\n \"num_unique_values\": 12859,\n \"samples\": [\n -73.928224564,\n -73.911054134\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"end_lat\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.19224496834365118,\n \"min\": 0.0,\n \"max\": 40.8863,\n \"num_unique_values\": 2056,\n \"samples\": [\n 40.85472,\n 40.65496\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"end_lng\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.3421826658401405,\n \"min\": -74.03852552175522,\n \"max\": 0.0,\n \"num_unique_values\": 2052,\n \"samples\": [\n -73.914527,\n -74.00887308\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"member_casual\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"casual\",\n \"member\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ride_date\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2024-06-30 00:00:00\",\n \"max\": \"2024-07-31 00:00:00\",\n \"num_unique_values\": 32,\n \"samples\": [\n \"2024-07-05 00:00:00\",\n \"2024-07-28 00:00:00\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ride_year\",\n \"properties\": {\n \"dtype\": \"int32\",\n \"num_unique_values\": 1,\n \"samples\": [\n 2024\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ride_month\",\n \"properties\": {\n \"dtype\": \"int32\",\n \"num_unique_values\": 2,\n \"samples\": [\n 6\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"ride_day\",\n \"properties\": {\n \"dtype\": \"int32\",\n \"num_unique_values\": 31,\n \"samples\": [\n 30\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"is_weekend\",\n \"properties\": {\n \"dtype\": \"boolean\",\n \"num_unique_values\": 2,\n \"samples\": [\n true\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"duration\",\n \"properties\": {\n \"dtype\": \"timedelta64[ns]\",\n \"num_unique_values\": 46321,\n \"samples\": [\n \"0 days 00:03:03.388000\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"duration_minutes\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 20.797872939299097,\n \"min\": 1.0,\n \"max\": 1385.3,\n \"num_unique_values\": 975,\n \"samples\": [\n 31.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}"}},"metadata":{},"execution_count":34}]},{"cell_type":"code","source":["# data might contain rides started in June and end in July\n","# They make up a very little portion of the dataset\n","# For the sake of analyzing more easily, we only consider\n","# rides started in July and end in July here.\n","df = df[df['ride_month']==7]"],"metadata":{"id":"DGVokdV4LTxy","executionInfo":{"status":"ok","timestamp":1724878169898,"user_tz":240,"elapsed":145,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":36,"outputs":[]},{"cell_type":"markdown","source":["## Export the sampled Data"],"metadata":{"id":"TLI-UdDvdHWU"}},{"cell_type":"code","source":["df.to_csv('/content/drive/MyDrive/afterschool_projects/ny_bike/citi_sampled_data_202407.csv', index=False)"],"metadata":{"id":"iPHoNYuEKl9q","executionInfo":{"status":"ok","timestamp":1724878299012,"user_tz":240,"elapsed":2344,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}}},"execution_count":37,"outputs":[]},{"cell_type":"markdown","source":["## Some Plots"],"metadata":{"id":"2-sVjWIUdEDa"}},{"cell_type":"code","source":["df_july = df\n","grouped_data = df_july.groupby(['ride_day', 'member_casual', 'is_weekend'])['duration_minutes'].mean().unstack('member_casual', fill_value=0).reset_index()\n","\n","# Set up the plot\n","fig, ax = plt.subplots(figsize=(12, 8))\n","\n","# Define custom RGB colors for member and casual\n","color_member_weekday = (0.1, 0.4, 0.8) # Darker blue for member (weekday)\n","color_casual_weekday = (0.6, 0.8, 1.0) # Lighter blue for casual (weekday)\n","color_member_weekend = (0.9, 0.4, 0.0) # Darker orange for member (weekend)\n","color_casual_weekend = (1.0, 0.7, 0.5) # Lighter orange for casual (weekend)\n","\n","# Loop through the grouped data and plot the bars\n","for index, row in grouped_data.iterrows():\n"," day = row['ride_day']\n"," is_weekend = row['is_weekend']\n"," member_duration = row.get('member', 0)\n"," casual_duration = row.get('casual', 0)\n","\n"," # Determine the colors based on weekend or weekday\n"," if is_weekend:\n"," member_color = color_member_weekend\n"," casual_color = color_casual_weekend\n"," else:\n"," member_color = color_member_weekday\n"," casual_color = color_casual_weekday\n","\n"," # Plot the bars for member and casual, stacking casual on top of member\n"," ax.bar(day, member_duration, color=member_color, label='Member' if index == 0 else \"\")\n"," ax.bar(day, casual_duration, bottom=member_duration, color=casual_color, label='Casual' if index == 0 else \"\")\n","\n","# Add labels and title\n","ax.set_xlabel('Day of the Month', fontsize=12)\n","ax.set_ylabel('Average Duration (minutes)', fontsize=12)\n","ax.set_title('Average Ride Duration per Day in July 2024', fontsize=16)\n","\n","# Add a legend\n","ax.legend()\n","\n","# Show the plot\n","plt.tight_layout()\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":807},"id":"s0JloG8GGK_X","executionInfo":{"status":"ok","timestamp":1724878301673,"user_tz":240,"elapsed":833,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"7d199f8c-50c7-4488-c81c-99623e538c4c"},"execution_count":38,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":["df_july = df\n","grouped_data = df_july.groupby(['ride_day', 'is_weekend'])['duration_minutes'].mean().reset_index()\n","\n","# Set up the plot\n","fig, ax = plt.subplots(figsize=(12, 8))\n","\n","# Define custom RGB colors for weekdays and weekends\n","color_weekday = (0.1, 0.4, 0.8) # Blue for weekday\n","color_weekend = (0.9, 0.4, 0.0) # Orange for weekend\n","\n","# Loop through the grouped data and plot the bars\n","for index, row in grouped_data.iterrows():\n"," day = row['ride_day']\n"," is_weekend = row['is_weekend']\n"," avg_duration = row['duration_minutes']\n","\n"," # Determine the color based on weekend or weekday\n"," color = color_weekend if is_weekend else color_weekday\n","\n"," # Plot the bar for the average duration\n"," ax.bar(day, avg_duration, color=color)\n","\n","# Add labels and title\n","ax.set_xlabel('Day of the Month', fontsize=12)\n","ax.set_ylabel('Average Duration (minutes)', fontsize=12)\n","ax.set_title('Average Ride Duration per Day in July 2024', fontsize=16)\n","\n","# Show the plot\n","plt.tight_layout()\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":807},"id":"LS_inK9ZGXT-","executionInfo":{"status":"ok","timestamp":1724878302699,"user_tz":240,"elapsed":542,"user":{"displayName":"Wangsheng Wu","userId":"00552226409306193312"}},"outputId":"b5fe289d-2c8f-4ca7-a3e7-cac0ecb2bc28"},"execution_count":39,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"code","source":[],"metadata":{"id":"DrAu1eX_KrRV"},"execution_count":null,"outputs":[]}]}