# Installing the libraries with the specified version.
#!pip install numpy==1.25.2 pandas==1.5.3 matplotlib==3.7.1 seaborn==0.13.1 -q --user

# import libraries for data manipulation
import numpy as np
import pandas as pd

# import libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Google Colab only: mount Google Drive at /content/drive so the dataset file can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# Load the FoodHub orders dataset from the mounted Google Drive folder
df = pd.read_csv('/content/drive/MyDrive/FoodHub/foodhub_order.csv')

# Preview the first 5 rows of the dataset
df.head()

# Dimensions of the dataset as (number of rows, number of columns)
df.shape

(1898, 9)

# Summarize each column's dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1898 entries, 0 to 1897
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   order_id               1898 non-null   int64  
 1   customer_id            1898 non-null   int64  
 2   restaurant_name        1898 non-null   object 
 3   cuisine_type           1898 non-null   object 
 4   cost_of_the_order      1898 non-null   float64
 5   day_of_the_week        1898 non-null   object 
 6   rating                 1898 non-null   object 
 7   food_preparation_time  1898 non-null   int64  
 8   delivery_time          1898 non-null   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 133.6+ KB

# Count the missing (null) values in every column
df.isnull().sum()

# Statistical summary of the numeric columns, transposed so each column is a row
df.describe().T

# Frequency of each rating value (includes the 'Not given' level)
df['rating'].value_counts()

# Apply the white-grid seaborn theme
sns.set(style="whitegrid")

# Number of orders per cuisine, bars sorted from most to least frequent
cuisine_order = df['cuisine_type'].value_counts().index
plt.figure(figsize=(12, 6))
ax = sns.countplot(
    data=df,
    x='cuisine_type',
    hue='cuisine_type',
    order=cuisine_order,
    palette='coolwarm',
)
ax.set_title('Count Plot of Cuisine Type', fontsize=16)
ax.set_xlabel('Cuisine Type')
ax.set_ylabel('Number of Orders')
plt.xticks(rotation=45)  # tilt the cuisine names so they do not overlap
plt.tight_layout()
plt.show()

# Apply the white-grid seaborn theme
sns.set(style="whitegrid")

# Number of orders per day type, most frequent day type first
day_order = df['day_of_the_week'].value_counts().index
plt.figure(figsize=(8, 5))
ax = sns.countplot(
    data=df,
    x='day_of_the_week',
    hue='day_of_the_week',
    order=day_order,
    palette='coolwarm',
)
ax.set_title('Count Plot of Day of the Week', fontsize=16)
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Number of Orders')
plt.tight_layout()
plt.show()

# Apply the white-grid seaborn theme
sns.set(style="whitegrid")

# Number of orders per rating value, most frequent rating first
rating_levels = df['rating'].value_counts().index
plt.figure(figsize=(10, 5))
ax = sns.countplot(
    data=df,
    x='rating',
    hue='rating',
    order=rating_levels,
    palette='coolwarm',
)
ax.set_title('Count Plot of Ratings', fontsize=16)
ax.set_xlabel('Rating')
ax.set_ylabel('Number of Orders')
plt.tight_layout()
plt.show()

# Apply the white-grid seaborn theme
sns.set(style="whitegrid")

# Distribution of order cost: 30-bin histogram with a KDE curve overlaid
plt.figure(figsize=(10, 5))
ax = sns.histplot(
    data=df,
    x='cost_of_the_order',
    bins=30,
    kde=True,
    color='red',
)
ax.set_title('Histogram of Cost of the Order', fontsize=16)
ax.set_xlabel('Cost of the Order ($)')
ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Set visual style
sns.set(style="whitegrid")

# Boxplot for cost_of_the_order.
# Open an explicitly sized figure first so the plot matches the
# cost_of_the_order histogram instead of falling back to the default size.
plt.figure(figsize=(10, 5))
sns.boxplot(x=df['cost_of_the_order'], color='red')
plt.title('Boxplot of Cost of the Order', fontsize=16)
plt.xlabel('Cost of the Order ($)')
plt.tight_layout()
plt.show()

# Apply the white-grid seaborn theme
sns.set(style="whitegrid")

# Distribution of food preparation time: 20-bin histogram with a KDE curve overlaid
plt.figure(figsize=(10, 5))
ax = sns.histplot(
    data=df,
    x='food_preparation_time',
    bins=20,
    kde=True,
    color='red',
)
ax.set_title('Histogram of Food Preparation Time', fontsize=16)
ax.set_xlabel('Food Preparation Time (minutes)')
ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Set visual style
sns.set(style="whitegrid")

# Boxplot for food_preparation_time.
# Open an explicitly sized figure first so the plot matches the
# food_preparation_time histogram instead of falling back to the default size.
plt.figure(figsize=(10, 5))
sns.boxplot(x=df['food_preparation_time'], color='red')
plt.title('Boxplot of Food Preparation Time', fontsize=16)
plt.xlabel('Food Preparation Time (minutes)')
plt.tight_layout()
plt.show()

# Apply the white-grid seaborn theme
sns.set(style="whitegrid")

# Distribution of delivery time: 20-bin histogram with a KDE curve overlaid
plt.figure(figsize=(10, 5))
ax = sns.histplot(
    data=df,
    x='delivery_time',
    bins=20,
    kde=True,
    color='red',
)
ax.set_title('Histogram of Delivery Time', fontsize=16)
ax.set_xlabel('Delivery Time (minutes)')
ax.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Set visual style
sns.set(style="whitegrid")

# Boxplot for delivery_time.
# Open an explicitly sized figure first so the plot matches the
# delivery_time histogram instead of falling back to the default size.
plt.figure(figsize=(10, 5))
sns.boxplot(x=df['delivery_time'], color='red')
plt.title('Boxplot of Delivery Time', fontsize=16)
plt.xlabel('Delivery Time (minutes)')
plt.tight_layout()
plt.show()

# Tally the orders received by each restaurant (most orders first)
# and keep the five busiest ones
orders_per_restaurant = df['restaurant_name'].value_counts()
top_5_restaurants = orders_per_restaurant.iloc[:5]

# Show the ranking
print("Top 5 Restaurants by Number of Orders:")
print(top_5_restaurants)

Top 5 Restaurants by Number of Orders:
restaurant_name
Shake Shack                  219
The Meatball Shop            132
Blue Ribbon Sushi            119
Blue Ribbon Fried Chicken     96
Parm                          68
Name: count, dtype: int64

# Keep only the orders placed on a weekend
is_weekend = df['day_of_the_week'] == 'Weekend'
weekend_orders = df.loc[is_weekend]

# Tally the weekend orders per cuisine (most ordered first)
weekend_cuisine_counts = weekend_orders['cuisine_type'].value_counts()

# The cuisine with the highest weekend tally
popular_cuisine_weekend = weekend_cuisine_counts.idxmax()

# Report the winner first, then the full per-cuisine breakdown
print(f"The most popular cuisine on weekends is: {popular_cuisine_weekend}")
print("Number of Orders per Cuisine on Weekends:")
print(weekend_cuisine_counts)

The most popular cuisine on weekends is: American
Number of Orders per Cuisine on Weekends:
cuisine_type
American          415
Japanese          335
Italian           207
Chinese           163
Mexican            53
Indian             49
Middle Eastern     32
Mediterranean      32
Thai               15
French             13
Korean             11
Southern           11
Spanish            11
Vietnamese          4
Name: count, dtype: int64

# Flag the orders costing more than $20 and count them
is_above_20 = df['cost_of_the_order'] > 20
orders_above_20 = int(is_above_20.sum())

# Total number of orders in the dataset
total_orders = len(df)

# Share of orders above $20, expressed as a percentage
percentage_above_20 = (orders_above_20 / total_orders) * 100

print(f"Percentage of orders costing more than $20: {percentage_above_20:.2f}%")

Percentage of orders costing more than $20: 29.24%

# Average of the delivery_time column across all orders
delivery_minutes = df['delivery_time']
mean_delivery_time = delivery_minutes.mean()

# Report the average to two decimals
print(f"Mean Order Delivery Time: {mean_delivery_time:.2f} minutes")

Mean Order Delivery Time: 24.16 minutes

# Tally the orders placed by each customer (most orders first)
# and keep the three most frequent customers
orders_per_customer = df['customer_id'].value_counts()
top_customers = orders_per_customer.iloc[:3]

# Show the customer IDs with their order counts
print("Top 3 Most Frequent Customers (Eligible for 20% Discount Voucher):")
print(top_customers)

Top 3 Most Frequent Customers (Eligible for 20% Discount Voucher):
customer_id
52832    13
47440    10
83287     9
Name: count, dtype: int64

# Numeric columns whose pairwise correlations are of interest
numerical_columns = ['cost_of_the_order', 'food_preparation_time', 'delivery_time']

# Pairwise correlation between those columns
corr_matrix = df.loc[:, numerical_columns].corr()

# Annotated heatmap of the correlation matrix (two decimals per cell)
plt.figure(figsize=(8, 6))
ax = sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
ax.set_title('Correlation Heatmap of Numerical Variables')
plt.show()

# Delivery time distribution per cuisine, cuisines sorted by order volume
cuisine_order = df['cuisine_type'].value_counts().index
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=df,
    x='cuisine_type',
    y='delivery_time',
    hue='cuisine_type',
    order=cuisine_order,
    palette='coolwarm',
)
plt.xticks(rotation=45)  # tilt the cuisine names so they do not overlap
ax.set_title('Delivery Time by Cuisine Type')
ax.set_xlabel('Cuisine Type')
ax.set_ylabel('Delivery Time (minutes)')
plt.show()

# Delivery time distribution for each day type
plt.figure(figsize=(8, 6))
ax = sns.boxplot(
    data=df,
    x='day_of_the_week',
    y='delivery_time',
    hue='day_of_the_week',
    palette='coolwarm',
)
ax.set_title('Delivery Time by Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Delivery Time (minutes)')
plt.show()

# Order cost distribution per cuisine, cuisines sorted by order volume
cuisine_order = df['cuisine_type'].value_counts().index
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=df,
    x='cuisine_type',
    y='cost_of_the_order',
    hue='cuisine_type',
    order=cuisine_order,
    palette='coolwarm',
)
plt.xticks(rotation=45)  # tilt the cuisine names so they do not overlap
ax.set_title('Cost of the Order by Cuisine Type')
ax.set_xlabel('Cuisine Type')
ax.set_ylabel('Cost of the Order ($)')
plt.show()

# Food preparation time distribution per cuisine, cuisines sorted by order volume
cuisine_order = df['cuisine_type'].value_counts().index
plt.figure(figsize=(12, 6))
ax = sns.boxplot(
    data=df,
    x='cuisine_type',
    y='food_preparation_time',
    hue='cuisine_type',
    order=cuisine_order,
    palette='coolwarm',
)
plt.xticks(rotation=45)  # tilt the cuisine names so they do not overlap
ax.set_title('Food Preparation Time by Cuisine Type')
ax.set_xlabel('Cuisine Type')
ax.set_ylabel('Food Preparation Time (minutes)')
plt.show()

# Mean delivery time for each rating level.
# 'rating' is a text column, so without an explicit order the x-axis follows the
# order in which the levels first appear in the data. Put the numeric ratings in
# ascending order and the non-numeric level ('Not given') last.
rating_order = sorted(
    df['rating'].unique(),
    key=lambda level: (not str(level).isdigit(), str(level)),
)

plt.figure(figsize=(10, 6))
sns.pointplot(x='rating', y='delivery_time', data=df, order=rating_order, color='red')
plt.title('Mean Delivery Time by Rating')
plt.xlabel('Rating')
plt.ylabel('Delivery Time (minutes)')
plt.show()

# Mean food preparation time for each rating level.
# 'rating' is a text column, so without an explicit order the x-axis follows the
# order in which the levels first appear in the data. Put the numeric ratings in
# ascending order and the non-numeric level ('Not given') last.
rating_order = sorted(
    df['rating'].unique(),
    key=lambda level: (not str(level).isdigit(), str(level)),
)

plt.figure(figsize=(10, 6))
sns.pointplot(x='rating', y='food_preparation_time', data=df, order=rating_order, color='red')
plt.title('Mean Food Preparation Time by Rating')
plt.xlabel('Rating')
plt.ylabel('Food Preparation Time (minutes)')
plt.show()

# Mean order cost for each rating level.
# 'rating' is a text column, so without an explicit order the x-axis follows the
# order in which the levels first appear in the data. Put the numeric ratings in
# ascending order and the non-numeric level ('Not given') last.
rating_order = sorted(
    df['rating'].unique(),
    key=lambda level: (not str(level).isdigit(), str(level)),
)

plt.figure(figsize=(10, 6))
sns.pointplot(x='rating', y='cost_of_the_order', data=df, order=rating_order, color='red')
plt.title('Mean Cost of the Order by Rating')
plt.xlabel('Rating')
plt.ylabel('Cost of the Order ($)')
plt.show()

# Keep only the rated orders and turn the rating strings into numbers
has_rating = df['rating'] != 'Not given'
df_filtered = df.loc[has_rating].copy()
df_filtered['rating'] = pd.to_numeric(df_filtered['rating'])

# Per restaurant: how many ratings it received and their average
restaurant_rating_stats = (
    df_filtered
    .groupby('restaurant_name')['rating']
    .agg(['count', 'mean'])
    .reset_index()
)

# Promotional offer criteria: more than 50 ratings and an average rating above 4
enough_ratings = restaurant_rating_stats['count'] > 50
high_average = restaurant_rating_stats['mean'] > 4
eligible_restaurants = restaurant_rating_stats[enough_ratings & high_average]

# Show the restaurants that satisfy both criteria
print(eligible_restaurants)

               restaurant_name  count      mean
16   Blue Ribbon Fried Chicken     64  4.328125
17           Blue Ribbon Sushi     73  4.219178
117                Shake Shack    133  4.278195
132          The Meatball Shop     84  4.511905

# Revenue the company earns on a single order
def calculate_revenue(cost):
    """Return the company's revenue for an order of the given cost.

    Orders costing more than 20 are charged 25% of the cost, orders costing
    more than 5 (up to and including 20) are charged 15%, and any other
    order yields 0.0.
    """
    if cost > 20:
        rate = 0.25
    elif cost > 5:
        rate = 0.15
    else:
        # Nothing is charged on orders of 5 or less
        return 0.0
    return cost * rate

# Revenue earned on every order, derived from that order's cost
order_costs = df['cost_of_the_order']
df['revenue'] = order_costs.apply(calculate_revenue)

# Net revenue is the sum over all orders
total_revenue = df['revenue'].sum()

print(f"Net Revenue Generated by the Company: ${total_revenue:.2f}")

Net Revenue Generated by the Company: $6166.30

# Total time from order placement to delivery: preparation time plus delivery time
df['total_delivery_time'] = df['food_preparation_time'] + df['delivery_time']

# Number of orders in the dataset
total_orders = len(df)

# Orders whose total time exceeds 60 minutes
took_over_60 = df['total_delivery_time'] > 60
orders_over_60 = int(took_over_60.sum())

# Share of such orders, expressed as a percentage
percentage_over_60 = (orders_over_60 / total_orders) * 100

# Report the counts and the percentage
print(f"Total number of orders: {total_orders}")
print(f"Number of orders with delivery time > 60 minutes: {orders_over_60}")
print(f"Percentage of orders taking more than 60 minutes: {percentage_over_60:.2f}%")

Total number of orders: 1898
Number of orders with delivery time > 60 minutes: 200
Percentage of orders taking more than 60 minutes: 10.54%

# Average delivery time for each day type, as a two-column frame
mean_delivery_time_by_day = (
    df
    .groupby('day_of_the_week')['delivery_time']
    .mean()
    .reset_index()
)

# Print one line per day type
print("Mean Delivery Time by Day Type:")
day_types = mean_delivery_time_by_day['day_of_the_week']
day_means = mean_delivery_time_by_day['delivery_time']
for day_type, day_mean in zip(day_types, day_means):
    print(f"{day_type}: {day_mean:.2f} minutes")

Mean Delivery Time by Day Type:
Weekday: 28.34 minutes
Weekend: 22.47 minutes

	count	mean	std	min	25%	50%	75%	max
order_id	1898.0	1.477496e+06	548.049724	1476547.00	1477021.25	1477495.50	1.477970e+06	1478444.00
customer_id	1898.0	1.711685e+05	113698.139743	1311.00	77787.75	128600.00	2.705250e+05	405334.00
cost_of_the_order	1898.0	1.649885e+01	7.483812	4.47	12.08	14.14	2.229750e+01	35.41
food_preparation_time	1898.0	2.737197e+01	4.632481	20.00	23.00	27.00	3.100000e+01	35.00
delivery_time	1898.0	2.416175e+01	4.972637	15.00	20.00	25.00	2.800000e+01	33.00

	count
rating
Not given	736
5	588
4	386
3	188

Rank	Customer ID	Number of Orders
1	52832	13 orders
2	47440	10 orders
3	83287	9 orders

	order_id	customer_id	restaurant_name	cuisine_type	cost_of_the_order	day_of_the_week	rating	food_preparation_time	delivery_time
0	1477147	337525	Hangawi	Korean	30.75	Weekend	Not given	25	20
1	1477685	358141	Blue Ribbon Sushi Izakaya	Japanese	12.08	Weekend	Not given	25	23
2	1477070	66393	Cafe Habana	Mexican	12.23	Weekday	5	23	28
3	1477334	106968	Blue Ribbon Fried Chicken	American	29.20	Weekend	3	25	15
4	1478249	76942	Dirty Bird to Go	American	11.59	Weekday	4	25	24

	0
order_id	0
customer_id	0
restaurant_name	0
cuisine_type	0
cost_of_the_order	0
day_of_the_week	0
rating	0
food_preparation_time	0
delivery_time	0

Project Python Foundations: FoodHub Data Analysis

Context

Objective

Data Description

Data Dictionary

Let us start by importing the required libraries

Understanding the structure of the data

Question 1: How many rows and columns are present in the data? [0.5 mark]

Observations:

Question 2: What are the datatypes of the different columns in the dataset? (The info() function can be used) [0.5 mark]

Observations:

Question 3: Are there any missing values in the data? If yes, treat them using an appropriate method. [1 mark]

Observations:

Question 4: Check the statistical summary of the data. What is the minimum, average, and maximum time it takes for food to be prepared once an order is placed? [2 marks]

Observations:

Question 5: How many orders are not rated? [1 mark]

Observations:

Exploratory Data Analysis (EDA)

Univariate Analysis

Question 6: Explore all the variables and provide observations on their distributions. (Generally, histograms, boxplots, countplots, etc. are used for univariate exploration.) [9 marks]

Count Plot of Cuisine Type:

Observations:

Count Plot of Day of the Week:

Observations:

Count Plot of Rating:

Observations:

Histogram of Cost of the Order:

Observations:

Boxplot of Cost of the Order:

Observations:

Histogram of Food Preparation Time:

Observations:

Boxplot of Food Preparation Time:

Observations:

Histogram of Delivery Time:

Observations:

Boxplot of Delivery Time:

Observations:

Question 7: Which are the top 5 restaurants in terms of the number of orders received? [1 mark]

Observations:

Question 8: Which is the most popular cuisine on weekends? [1 mark]

Observations:

Question 9: What percentage of the orders cost more than 20 dollars? [2 marks]

Observations:

Question 10: What is the mean order delivery time? [1 mark]

Observations:

Question 11: The company has decided to give 20% discount vouchers to the top 3 most frequent customers. Find the IDs of these customers and the number of orders they placed. [1 mark]

Observations:

Multivariate Analysis

Question 12: Perform a multivariate analysis to explore relationships between the important variables in the dataset. (It is a good idea to explore relations between numerical variables as well as relations between numerical and categorical variables) [10 marks]

Correlation Heatmap (Numerical Variables)

Observations:

Delivery Time by Cuisine Type

Observations:

Delivery Time by Day of the Week

Observations:

Cost of the Order by Cuisine Type

Observations:

Cuisine vs. Food Preparation Time

Observations:

Mean Delivery Time vs. Rating

Observations:

Mean Food Preparation Time vs. Rating

Observations:

Mean Cost of the Order vs. Rating

Observations:

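Question 13: Find the restaurants eligible for the promotional offer, i.e. those with a rating count of more than 50 and an average rating greater than 4.

Observations: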

Question 14: The company charges the restaurant 25% on the orders having cost greater than 20 dollars and 15% on the orders having cost greater than 5 dollars. Find the net revenue generated by the company across all orders. [3 marks]

Observations:

Question 15: The company wants to analyze the total time required to deliver the food. What percentage of orders take more than 60 minutes to get delivered from the time the order is placed? (The food has to be prepared and then delivered.) [2 marks]

Observations:

Question 16: The company wants to analyze the delivery time of the orders on weekdays and weekends. How does the mean delivery time vary during weekdays and weekends? [2 marks]

Observations:

Conclusion and Recommendations

Question 17: What are your conclusions from the analysis? What recommendations would you like to share to help improve the business? (You can use cuisine type and feedback ratings to drive your business recommendations.) [6 marks]

Conclusions:

Recommendations: