Data Days for Good 2023, MassMutual
Analyzing CS literacy in MA districts and counties, addressing financial or social inequities
- Filling Missing Values
- Required Binning of at least 1 CS Class Participation into 3 Bins
- Creation of Others Dataframe (Asian + White)
- Joining of Dataframes (CS Classes Participation + AP Test Scores)
- Pandas Profiling
- Dual Box Plots Creation on Basis of CS Course Offered or CS Course Not-Offered:
!ls
import pandas as pd

# header=1 makes pandas take the column names from row index 1 of each sheet.
_excel_opts = {"header": 1}

# Course-participation ("artcourse") workbooks, one per student group.
df1 = pd.read_excel("datasets/AfricAmerican-Black/artcourse-afrc-amer-cs.xlsx", **_excel_opts)
df3 = pd.read_excel("datasets/Hispanic-Latino/artcourse-hisplat-cs.xlsx", **_excel_opts)
dfw = pd.read_excel("datasets/Others-(Asian + White)/artcourse-white.xlsx", **_excel_opts)
dfa = pd.read_excel("datasets/Others-(Asian + White)/artcourse-asian.xlsx", **_excel_opts)

# AP performance workbooks for the same four groups.
df1_ap = pd.read_excel("datasets/AfricAmerican-Black/ap_performance_afr_amerc.xlsx", **_excel_opts)
df3_ap = pd.read_excel("datasets/Hispanic-Latino/ap_performance_hisp_latino.xlsx", **_excel_opts)
dfw_ap = pd.read_excel("datasets/Others-(Asian + White)/ap_performance_white.xlsx", **_excel_opts)
dfa_ap = pd.read_excel("datasets/Others-(Asian + White)/ap_performance_asian.xlsx", **_excel_opts)

# Quick look at the first rows of the African-American/Black and Hispanic/Latino frames.
df1.head(5)
df3.head(5)
df1_ap.head(5)
df3_ap.head(5)
# Report the column dtypes and row count of each White/Asian source frame.
_frames_to_report = (
    ("dfw_ap", dfw_ap),
    ("dfa_ap", dfa_ap),
    ("dfw", dfw),
    ("dfa", dfa),
)
for position, (label, frame) in enumerate(_frames_to_report):
    # Every heading except the very first is preceded by a blank line.
    leading_newline = "\n" if position else ""
    print(f"{leading_newline}Data types of {label}:")
    print(frame.dtypes)
    print(f"\nLength of {label}:", len(frame))
# Text-typed numeric columns of the White (dfw*) and Asian (dfa*) frames.
# Each listed column has its commas removed and surrounding whitespace
# stripped, and is then cast to float.
_text_columns_by_frame = (
    (dfw, ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', 'All Grades', 'Total Students']),
    (dfw_ap, ['Tests Taken', 'Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5']),
    (dfa, ['05', '06', '07', '08', 'All Grades', 'Total Students']),
    (dfa_ap, ['Tests Taken', 'Score=4', 'Score=5']),
)
for frame, columns in _text_columns_by_frame:
    for column in columns:
        without_commas = frame[column].str.replace(',', '')
        frame[column] = without_commas.str.strip().astype(float)
# Text-typed numeric columns of the African-American/Black (df1*) and
# Hispanic/Latino (df3*) frames: remove commas, strip whitespace, cast to float.
_group_text_columns = (
    (df1, 'All Grades'),
    (df1, 'Total Students'),
    (df1_ap, 'Tests Taken'),
    (df3_ap, 'Tests Taken'),
    (df3, 'All Grades'),
    (df3, 'Total Students'),
    (df3, '09'),
)
for target_frame, target_column in _group_text_columns:
    stripped = target_frame[target_column].str.replace(',', '').str.strip()
    target_frame[target_column] = stripped.astype(float)
# Column dtypes of all eight frames after the float casts.
print(df1.dtypes, df3.dtypes, df1_ap.dtypes, df3_ap.dtypes)
print(dfw.dtypes, dfw_ap.dtypes, dfa.dtypes, dfa_ap.dtypes)

# Column-wise null counts. The AP frames are printed explicitly as well: a bare
# expression followed by a trailing comma only builds a throwaway tuple and
# shows nothing unless it happens to be the last line of a notebook cell.
print(df1.isnull().sum(), df3.isnull().sum())
print(df1_ap.isnull().sum(), df3_ap.isnull().sum())
df3.dtypes
import pandas as pd
import matplotlib.pyplot as plt

# Per-district percentage of 'All Grades' relative to 'Total Students'.
df1['AfriAmerican-Black-%'] = 100 * df1['All Grades'] / df1['Total Students']

# Rank districts by that percentage and keep the leading rows.
top_n = 40
sorted_df = df1.sort_values(by='AfriAmerican-Black-%', ascending=False)
top_districts = sorted_df.head(top_n)

# Tall figure so that all district names fit on the y-axis.
district_names = top_districts['District Name']
percentages = top_districts['AfriAmerican-Black-%']
plt.figure(figsize=(10, 12))
plt.barh(district_names, percentages)
plt.title(f'Top {top_n} Districts with Highest African-American Percentage')
plt.ylabel('District Name')
plt.xlabel('AfriAmerican-Black-%')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Per-district percentage of 'All Grades' relative to 'Total Students'.
df3['Hispanic-Latino-%'] = 100 * df3['All Grades'] / df3['Total Students']

# Highest percentages first; only the leading rows are charted.
top_n = 40
sorted_df = df3.sort_values(by='Hispanic-Latino-%', ascending=False)
top_districts = sorted_df.head(top_n)

# Horizontal bar chart of the top districts (tall figure to fit every label).
plt.figure(figsize=(10, 12))
plt.barh(
    top_districts['District Name'],
    top_districts['Hispanic-Latino-%'],
)
plt.title(f'Top {top_n} Districts with Highest Hispanic-Latino Percentage')
plt.ylabel('District Name')
plt.xlabel('Hispanic-Latino-%')
plt.show()
# Replace every missing value with 0, in place, in all eight frames.
for frame in (df1_ap, df3_ap, df1, df3, dfa, dfw, dfa_ap, dfw_ap):
    frame.fillna(0, inplace=True)

# Number of cells equal to 0 in each of the two AP frames.
num_zeros1, num_zeros3 = (
    (ap_frame == 0).sum().sum() for ap_frame in (df1_ap, df3_ap)
)
num_zeros1, num_zeros3
data_frames = [df1_ap, df3_ap, df1, df3, dfw, dfw_ap, dfa, dfa_ap]

# Print summary statistics for every frame in the list. Iterating the list
# itself (rather than a fixed count of 4) ensures the White/Asian frames at
# the end are summarised too.
for frame in data_frames:
    print(frame.describe())
    print("-------------------")

df1.dtypes
df1_ap.dtypes
df3.dtypes
# Per-grade columns that make up each of the three grade bands.
selected_columns1 = ['K', '01', '02', '03']
selected_columns2 = ['04', '05', '06', '07', '08']
selected_columns3 = ['09', '10', '11', '12']

_grade_bands = {
    'Primary': selected_columns1,
    'Secondary': selected_columns2,
    'High': selected_columns3,
}

# Add one row-wise total column per band to each course-participation frame.
for band_name, band_columns in _grade_bands.items():
    for frame in (df1, df3, dfa, dfw):
        frame[band_name] = frame[band_columns].sum(axis=1)

df1['Primary'].describe()
df3.isnull().sum()  # column-wise null check
# The per-grade columns are no longer needed once the band totals exist.
columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
for frame in (df1, df3, dfa, dfw):
    frame.drop(columns=columns_to_drop, inplace=True)

# Show the remaining columns and dtypes of each frame, followed by a blank line.
dataframes = {'dfa': dfa, 'dfw': dfw, 'df1': df1, 'df3': df3}
for name, df in dataframes.items():
    print(f"Data types of DataFrame {name}:", df.dtypes, "", sep="\n")
dfw.dtypes
import pandas as pd
# Merge the dataframes based on the common columns 'District Name' and 'District Code'
merged_df_o = pd.merge(dfw, dfa, on=['District Name', 'District Code'], how='outer')
merged_df_o.dtypes
# rename_columns = {
# 'Sum_K-3_x': 'Sum_K-3',
# 'Sum_04-08_x': 'Sum_04-08',
# 'Sum_09-12_x': 'Sum_09-12',
# 'Sum_K-3_y': 'Sum_K-3',
# 'Sum_04-08_y': 'Sum_04-08',
# 'Sum_09-12_y': 'Sum_09-12',
# 'Total Students_x': 'Total Students',
# 'All Grades_x': 'All Grades',
# 'Total Students_y': 'Total Students',
# 'All Grades_y': 'All Grades',
# }
# merged_df_o = merged_df_o.rename(columns=rename_columns)
merged_df_o
merged_df_o.dtypes
merged_df_o['All Grades'] = merged_df_o['All Grades_x'] + merged_df_o['All Grades_y']
merged_df_o['Total Students'] = merged_df_o['Total Students_x'] + merged_df_o['Total Students_y']
merged_df_o['Primary'] = merged_df_o['Primary_x'] + merged_df_o['Primary_y']
merged_df_o['Secondary'] = merged_df_o['Secondary_x'] + merged_df_o['Secondary_y']
merged_df_o['High'] = merged_df_o['High_x'] + merged_df_o['High_y']
merged_df_o.drop(['All Grades_x', 'All Grades_y', 'Total Students_x', 'Total Students_y', 'Primary_x', 'Primary_y', 'Secondary_x', 'Secondary_y', 'High_x', 'High_y'], axis=1, inplace=True)
merged_df_o
merged_df_o['Others-%'] = merged_df_o['All Grades'] * 100 / merged_df_o['Total Students']
merged_df_o.dtypes
merged_df_o.fillna(0, inplace=True)
!ls
merged_df_o.to_excel('merged_df_others_cs-class.xlsx', index=False)
nan_values = merged_df_o.isnull().sum()
print(nan_values)
len(dfw_ap)
len(dfa_ap)
dfw_ap.dtypes
dfa_ap.dtypes
merged_df_o_ap = pd.merge(dfw_ap, dfa_ap, on=['District Name', 'District Code'], how='outer')
merged_df_o_ap.dtypes
merged_df_o_ap['Tests Taken'] = merged_df_o_ap['Tests Taken_x'] + merged_df_o_ap['Tests Taken_y']
merged_df_o_ap['Score=1'] = merged_df_o_ap['Score=1_x'] + merged_df_o_ap['Score=1_y']
merged_df_o_ap['Score=2'] = merged_df_o_ap['Score=2_x'] + merged_df_o_ap['Score=2_y']
merged_df_o_ap['Score=3'] = merged_df_o_ap['Score=3_x'] + merged_df_o_ap['Score=3_y']
merged_df_o_ap['Score=4'] = merged_df_o_ap['Score=4_x'] + merged_df_o_ap['Score=4_y']
merged_df_o_ap['Score=5'] = merged_df_o_ap['Score=5_x'] + merged_df_o_ap['Score=5_y']
merged_df_o_ap['Tests Taken'] = merged_df_o_ap['Tests Taken_x'] + merged_df_o_ap['Tests Taken_y']
merged_df_o_ap['Score=1'] = merged_df_o_ap['Score=1_x'] + merged_df_o_ap['Score=1_y']
merged_df_o_ap['Score=2'] = merged_df_o_ap['Score=2_x'] + merged_df_o_ap['Score=2_y']
merged_df_o_ap['Score=3'] = merged_df_o_ap['Score=3_x'] + merged_df_o_ap['Score=3_y']
merged_df_o_ap['Score=4'] = merged_df_o_ap['Score=4_x'] + merged_df_o_ap['Score=4_y']
merged_df_o_ap['Score=5'] = merged_df_o_ap['Score=5_x'] + merged_df_o_ap['Score=5_y']
merged_df_o_ap.drop(['Tests Taken_x', 'Tests Taken_y', 'Score=1_x', 'Score=1_y', 'Score=2_x', 'Score=2_y', 'Score=3_x', 'Score=3_y', 'Score=4_x', 'Score=4_y', 'Score=5_x', 'Score=5_y', '% Score 1-2_x', '% Score 1-2_y', '% Score 3-5_x', '% Score 3-5_y'], axis=1, inplace=True)
merged_df_o_ap.dtypes
merged_df_o_ap['% Score 1-2'] = ((merged_df_o_ap['Score=1'] + merged_df_o_ap['Score=2'])/merged_df_o_ap['Tests Taken'])
merged_df_o_ap['% Score 3-5'] = ((merged_df_o_ap['Score=3'] + merged_df_o_ap['Score=4'] + merged_df_o_ap['Score=5'])/merged_df_o_ap['Tests Taken'])
merged_df_o_ap.dtypes
merged_df_o_ap.fillna(0, inplace=True)
merged_df_o_ap.drop(['Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5'], axis=1, inplace=True)
merged_df_o_ap.dtypes
!ls
merged_df_o_ap.to_excel('merged_df_ap_performance_others.xlsx', index=False)
len(merged_df_o_ap)
len(merged_df_o)
# How many distinct district codes does each group frame contain?
data_frames = {'df1': df1, 'df1_ap': df1_ap, 'df3': df3, 'df3_ap': df3_ap}
for df_name in data_frames:
    df = data_frames[df_name]
    unique_values = df['District Code'].nunique()
    print("Number of unique values in 'District Code' for", df_name, ":", unique_values)

df1_ap.dtypes
df3_ap.isnull().sum()
df3_ap.dtypes

# Remove the individual score-count columns from both AP frames.
columns_to_drop = ['Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5']
for ap_frame in (df1_ap, df3_ap):
    ap_frame.drop(columns=columns_to_drop, inplace=True)

df1_ap.dtypes
df3_ap.dtypes
len(merged_df_o_ap)
len(merged_df_o)

# Inner join of the combined White+Asian course frame with its AP frame.
# Joining on 'District Code' alone leaves 'District Name' present in both
# inputs, so the result carries 'District Name_x' / 'District Name_y'.
merged_df_others_o_ap = merged_df_o.merge(merged_df_o_ap, on='District Code', how='inner')

len(df1)
len(df1_ap)
len(df3)
len(df3_ap)

# Join 1: inner - only districts present in both the course and the AP frame.
merged_df1_i = df1.merge(df1_ap, on='District Code', how='inner')
merged_df3_i = df3.merge(df3_ap, on='District Code', how='inner')
merged_df3_i.head(5)
merged_df1_i.head(5)
merged_df_o_ap.head(5)
merged_df_o.head(5)

# Join 2: left - every district of the course frame, with AP columns where
# available. The "Others" frame uses how='left' as well so that it matches its
# name and the two sibling left joins.
merged_df1_l = df1.merge(df1_ap, on='District Code', how='left')
merged_df3_l = df3.merge(df3_ap, on='District Code', how='left')
merged_df_others_o_ap_left = merged_df_o.merge(merged_df_o_ap, on='District Code', how='left')
len(merged_df_others_o_ap_left)
len(merged_df1_l)
len(merged_df3_l)
merged_df_others_o_ap_left

# Districts without an AP row get NaN in the AP columns of a left join; use 0.
merged_df_others_o_ap.fillna(0, inplace=True)
merged_df_others_o_ap_left.fillna(0, inplace=True)
merged_df1_l.fillna(0, inplace=True)
merged_df3_l.fillna(0, inplace=True)

merged_df1_l.isnull().sum()
merged_df3_l.isnull().sum()
merged_df_others_o_ap.isnull().sum()
merged_df_others_o_ap.head(5)
merged_df1_l.head(5)
merged_df3_l.head(5)
!ls
merged_df1_i.dtypes
!ls
merged_df1_i.to_excel('merged_df_Afric-American.xlsx', index=False)
merged_df3_i.to_excel('merged_df_Hispanic-Latino.xlsx', index=False)
merged_df_others_o_ap.to_excel('merged_df_Others-(Asian+White).xlsx', index=False)
import matplotlib.pyplot as plt

# NOTE(review): `merged_df2` is not defined anywhere in the visible part of
# this file - confirm where it comes from before running this cell.
scatter_data = merged_df2[['Sum_K-3', 'Sum_04-08', 'Sum_09-12', 'Tests Taken']]

# One scatter series per grade-band column, all against 'Tests Taken'.
for band_column in ('Sum_K-3', 'Sum_04-08', 'Sum_09-12'):
    plt.scatter(scatter_data['Tests Taken'], scatter_data[band_column], label=band_column)

plt.ylabel('Sum Values')
plt.xlabel('Tests Taken')
plt.legend()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): `merged_df2` is not defined anywhere in the visible part of
# this file - confirm where it comes from before running this cell.
scatter_data = merged_df2[['Sum_K-3', 'Sum_04-08', 'Sum_09-12', 'Tests Taken']]
tests_taken = scatter_data['Tests Taken']

# Semi-transparent markers so that overlapping points stay visible.
for band_column in ['Sum_K-3', 'Sum_04-08', 'Sum_09-12']:
    plt.scatter(tests_taken, scatter_data[band_column], label=band_column, alpha=0.5)

# Restrict the x-axis to 0-200 tests.
plt.xlim(0, 200)

plt.grid(True)
plt.ylabel('Sum Values')
plt.xlabel('Tests Taken')
plt.legend()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# NOTE(review): `merged_df2` is not defined anywhere in the visible part of
# this file - confirm where it comes from before running this cell.

# Three spatial axes plus a fourth column mapped to marker colour.
x, y, z, c = (
    merged_df2[column_name]
    for column_name in ('Tests Taken', 'Sum_K-3', 'Sum_04-08', 'Sum_09-12')
)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(x, y, z, c=c, cmap='viridis')

ax.set_title('3D Scatter Plot')
ax.set_xlabel('Tests Taken')
ax.set_ylabel('Sum_K-3')
ax.set_zlabel('Sum_04-08')

# Colour scale for the 'Sum_09-12' values.
cbar = plt.colorbar(scatter)
cbar.set_label('Sum_09-12')

plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): `merged_df2` is not defined anywhere in the visible part of
# this file - confirm where it comes from before running this cell.
_correlation_columns = ['Sum_K-3', 'Sum_04-08', 'Sum_09-12', 'Tests Taken']
heatmap_data = merged_df2[_correlation_columns]

# Pairwise correlations between the four columns, drawn as an annotated heatmap.
correlation_matrix = heatmap_data.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title('Heatmap of Correlation')
plt.show()
# from pandas_profiling import ProfileReport
# # Assuming you already have the DataFrame 'merged_df2'
# # Generate the pandas profiling report
# profile = ProfileReport(merged_df2, title='Pandas Profiling Report')
# # Display the report as an interactive widget form
# profile.to_widgets()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# NOTE(review): `merged_df2` is not defined anywhere in the visible part of
# this file - confirm where it comes from before running this cell.

# Create a 3D scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Three spatial axes, a colour column and the per-point text label.
x = merged_df2['Tests Taken']
y = merged_df2['Sum_K-3']
z = merged_df2['Sum_04-08']
c = merged_df2['Sum_09-12']
labels = merged_df2['District Name_x']

# Scatter plot with colour-coded points based on 'Sum_09-12'
scatter = ax.scatter(x, y, z, c=c, cmap='viridis')

# Label every point with its district name. The columns are walked in lockstep
# by position: indexing the Series with an enumerate() counter (`x[i]`) is a
# label lookup and fails or mislabels points when the frame's index is not
# exactly 0..n-1 (e.g. after filtering).
for x_value, y_value, z_value, label in zip(x, y, z, labels):
    ax.text(x_value, y_value, z_value, label, color='black', fontsize=8, ha='center', va='center')

# Set labels and title
ax.set_xlabel('Tests Taken')
ax.set_ylabel('Sum_K-3')
ax.set_zlabel('Sum_04-08')
ax.set_title('3D Scatter Plot')

# Add a colorbar
cbar = plt.colorbar(scatter)
cbar.set_label('Sum_09-12')

# Show the plot
plt.show()
# Inspect the columns/dtypes of the joined frames. In a notebook only the last
# bare expression of a cell is displayed, so which of these are shown depends
# on the original cell boundaries.
merged_df1_i.dtypes
merged_df_others_o_ap.dtypes
# Disabled clean-up steps kept for reference (not executed):
# column_index_to_drop = 7 # Specify the index of the column you want to drop
# merged_df_others_o_ap.drop(merged_df_others_o_ap.columns[column_index_to_drop], axis=1, inplace=True)
# columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# merged_df1_i.drop(columns=columns_to_drop, inplace=True)
# columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# merged_df3_i.drop(columns=columns_to_drop, inplace=True)
merged_df1_i.dtypes
merged_df3_i.dtypes
merged_df3_i.dtypes
merged_df1_i.dtypes
# Disabled: dropping the 'CS course Binary ...' columns from merged_df1_i.
# merged_df1_i = merged_df1_i.drop(['CS course Binary for Sum_K-3', 'CS course Binary for Sum_04-08', 'CS course Binary for Sum_09-12'], axis=1)
!pip install -U ydata-profiling
!ls
from ydata_profiling import ProfileReport
profile1 = ProfileReport(merged_df1_i, title="District-wise Profiling Report of African American/Black CS Course Takers", minimal= False)
profile1.to_file("District-wise_African_American_CS_Course_Takers_Profiling_Report.html")
profile2 = ProfileReport(merged_df3_i, title="District-wise Profiling Report of Hispanic/Latino CS Course Takers", minimal= False)
profile2.to_file("District-wise_Hispanic_Latino_CS_Course_Takers_Profiling_Report.html")
profile3 = ProfileReport(merged_df_others_o_ap, title="District-wise Profiling Report of Others(Asian & White) CS Course Takers", minimal= False)
profile3.to_file("District-wise_Others_CS_Course_Takers_Profiling_Report.html")
!ls
import matplotlib.pyplot as plt
from PIL import Image

# Render an image file on the current matplotlib axes with a title.
_image_path = 'image.jpg'
img = Image.open(_image_path)
plt.imshow(img)
plt.title('Image Title')
Afri-Amer-Black.png Hispanic-Latino.png Others-Asian-White.png
import matplotlib.pyplot as plt
from PIL import Image

# Display the saved African American/Black image inline.
_image_path = 'Afri-Amer-Black.png'
img1 = Image.open(_image_path)
plt.imshow(img1)
plt.title('African American/Black CS Course Takers')
plt.show()
Takeaways from the above heatmap for African-American/Black students:
- High School has a high correlation with All Grades
import matplotlib.pyplot as plt

merged_df1_i.dtypes

# Keep only rows whose 'Secondary' value lies within 0-100 (inclusive).
_secondary = merged_df1_i['Secondary']
_in_range = (_secondary >= 0) & (_secondary <= 100)
filtered_merged_df1_i = merged_df1_i[_in_range]

# '% Score 3-5' plotted against the 'Secondary' total.
plt.scatter(filtered_merged_df1_i['Secondary'], filtered_merged_df1_i['% Score 3-5'])
plt.title('Scatter Plot: % Score 3-5 vs. Secondary')
plt.ylabel('% Score 3-5')
plt.xlabel('Secondary')
plt.grid(True)
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only rows whose 'Secondary' value lies within 0-100 (inclusive).
_secondary = merged_df1_i['Secondary']
_in_range = (_secondary >= 0) & (_secondary <= 100)
filtered_merged_df1_i = merged_df1_i[_in_range]

# Same relationship as a seaborn scatter plot.
sns.scatterplot(data=filtered_merged_df1_i, x='Secondary', y='% Score 3-5')
plt.title('Scatter Plot: Secondary vs % Score 3-5')
plt.ylabel('% Score 3-5')
plt.xlabel('Secondary')
plt.grid(True)
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only rows whose 'High' value lies within 0-100 (inclusive).
_high = merged_df1_i['High']
_in_range = (_high >= 0) & (_high <= 100)
filtered_merged_df1_i = merged_df1_i[_in_range]

# '% Score 3-5' plotted against the 'High' total.
sns.scatterplot(data=filtered_merged_df1_i, x='High', y='% Score 3-5')
plt.title('Scatter Plot: High vs % Score 3-5')
plt.ylabel('% Score 3-5')
plt.xlabel('High')
plt.grid(True)
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

merged_df1_i.dtypes

# Keep only rows whose 'High' value lies within 0-200 (inclusive).
_high = merged_df1_i['High']
_in_range = (_high >= 0) & (_high <= 200)
filtered_merged_df1_i = merged_df1_i[_in_range]

# 'All Grades' plotted against the 'High' total.
sns.scatterplot(data=filtered_merged_df1_i, x='High', y='All Grades')
plt.title('Scatter Plot: High vs All Grades')
plt.ylabel('All Grades')
plt.xlabel('High')
plt.grid(True)
plt.show()
The scatter plot above shows a clear correlation between High School and All Grades, indicating that this ethnic group's contribution to All Grades is higher in the later (high-school) grades than in the primary or secondary grades.
import matplotlib.pyplot as plt
import seaborn as sns

merged_df1_i.dtypes

# '% Score 3-5' plotted against the 'AfriAmerican-Black-%' column, all rows.
sns.scatterplot(data=merged_df1_i, x='AfriAmerican-Black-%', y='% Score 3-5')
plt.title('Scatter Plot: AfriAmerican-Black-% vs Passing Percentage')
plt.ylabel('% Score 3-5')
plt.xlabel('AfriAmerican-Black-%')
plt.grid(True)
plt.show()
import matplotlib.pyplot as plt
from PIL import Image

# Display the saved Hispanic/Latino image inline.
_image_path = 'Hispanic-Latino.png'
img2 = Image.open(_image_path)
plt.imshow(img2)
plt.title('Hispanic/Latino CS Course Takers')
plt.show()