!ls
African-American-Box Plot(04-08).png Other Ethinic Groups
CS682-TeamD.pptx                     Priyank-DDFG-Work.ipynb
Ethinicities-Profiling-Reports       datasets
Old-ddfg-work.ipynb                  yui.pptx
import pandas as pd
df1 = pd.read_excel("datasets/AfricAmerican-Black/artcourse-afrc-amer-cs.xlsx", header=1)
df3 = pd.read_excel("datasets/Hispanic-Latino/artcourse-hisplat-cs.xlsx", header=1)
df1.head(5)
District Name District Code K 01 02 03 04 05 06 07 08 09 10 11 12 All Grades Total Students
0 Abby Kelley Foster Charter Public (District) 4450000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 775
1 Abington 10000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1 111
2 Academy Of the Pacific Rim Charter Public (Dis... 4120000 NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 0.0 0.0 6.0 8.0 14 307
3 Acton-Boxborough 6000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 153
4 Acushnet 30000 0.0 NaN 0.0 0.0 0.0 0.0 NaN NaN 0.0 NaN NaN NaN NaN 0 12
df3.head(5)
District Name District Code K 01 02 03 04 05 06 07 08 09 10 11 12 All Grades Total Students
0 Abby Kelley Foster Charter Public (District) 4450000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0 0.0 0 293
1 Abington 10000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0.0 1.0 0.0 1 281
2 Academy Of the Pacific Rim Charter Public (Dis... 4120000 NaN NaN NaN NaN NaN 0.0 0.0 0.0 0.0 0 0.0 4.0 4.0 8 144
3 Acton-Boxborough 6000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 0.0 0.0 2.0 3 379
4 Acushnet 30000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN NaN NaN 0 42
import pandas as pd
df1_ap = pd.read_excel("datasets/AfricAmerican-Black/ap_performance_afr_amerc.xlsx", header=1)
df3_ap = pd.read_excel("datasets/Hispanic-Latino/ap_performance_hisp_latino.xlsx", header=1)
df1_ap.head(5)
District Name District Code Tests Taken Score=1 Score=2 Score=3 Score=4 Score=5 % Score 1-2 % Score 3-5
0 Abington 10000 1 NaN NaN NaN NaN NaN NaN NaN
1 Academy Of the Pacific Rim Charter Public (Dis... 4120000 7 NaN NaN NaN NaN NaN NaN NaN
2 Advanced Math and Science Academy Charter (Dis... 4300000 3 NaN NaN NaN NaN NaN NaN NaN
3 Amherst-Pelham 6050000 2 NaN NaN NaN NaN NaN NaN NaN
4 Andover 90000 2 NaN NaN NaN NaN NaN NaN NaN
df3_ap.head(5)
District Name District Code Tests Taken Score=1 Score=2 Score=3 Score=4 Score=5 % Score 1-2 % Score 3-5
0 Abington 10000 2 NaN NaN NaN NaN NaN NaN NaN
1 Academy Of the Pacific Rim Charter Public (Dis... 4120000 5 NaN NaN NaN NaN NaN NaN NaN
2 Acton-Boxborough 6000000 4 NaN NaN NaN NaN NaN NaN NaN
3 Advanced Math and Science Academy Charter (Dis... 4300000 3 NaN NaN NaN NaN NaN NaN NaN
4 Amesbury 70000 1 NaN NaN NaN NaN NaN NaN NaN
import pandas as pd
dfw = pd.read_excel("datasets/Others-(Asian + White)/artcourse-white.xlsx", header=1)
dfa = pd.read_excel("datasets/Others-(Asian + White)/artcourse-asian.xlsx", header=1)
import pandas as pd
dfw_ap = pd.read_excel("datasets/Others-(Asian + White)/ap_performance_white.xlsx", header=1)
dfa_ap = pd.read_excel("datasets/Others-(Asian + White)/ap_performance_asian.xlsx", header=1)
print("Data types of dfw_ap:")
print(dfw_ap.dtypes)
print("\nLength of dfw_ap:", len(dfw_ap))

print("\nData types of dfa_ap:")
print(dfa_ap.dtypes)
print("\nLength of dfa_ap:", len(dfa_ap))


print("\nData types of dfw:")
print(dfw.dtypes)
print("\nLength of dfw:", len(dfw))

print("\nData types of dfa:")
print(dfa.dtypes)
print("\nLength of dfa:", len(dfa))
Data types of dfw_ap:
District Name     object
District Code      int64
Tests Taken       object
Score=1           object
Score=2           object
Score=3           object
Score=4           object
Score=5           object
% Score 1-2      float64
% Score 3-5      float64
dtype: object

Length of dfw_ap: 273

Data types of dfa_ap:
District Name     object
District Code      int64
Tests Taken       object
Score=1          float64
Score=2          float64
Score=3          float64
Score=4           object
Score=5           object
% Score 1-2      float64
% Score 3-5      float64
dtype: object

Length of dfa_ap: 220

Data types of dfw:
District Name      object
District Code       int64
K                  object
01                 object
02                 object
03                 object
04                 object
05                 object
06                 object
07                 object
08                 object
09                 object
10                float64
11                float64
12                float64
All Grades         object
Total Students     object
dtype: object

Length of dfw: 399

Data types of dfa:
District Name      object
District Code       int64
K                 float64
01                float64
02                float64
03                float64
04                float64
05                 object
06                 object
07                 object
08                 object
09                float64
10                float64
11                float64
12                float64
All Grades         object
Total Students     object
dtype: object

Length of dfa: 370
# dfw & dfw_ap 
columns = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', 'All Grades', 'Total Students']

for column in columns:
    dfw[column] = dfw[column].str.replace(',', '').str.strip()
    dfw[column] = dfw[column].astype(float)

columns = ['Tests Taken', 'Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5']
for column in columns:
    dfw_ap[column] = dfw_ap[column].str.replace(',', '').str.strip()
    dfw_ap[column] = dfw_ap[column].astype(float)

    

# dfa & dfa_ap
columns = ['05', '06', '07', '08', 'All Grades', 'Total Students']
for column in columns:
    dfa[column] = dfa[column].str.replace(',', '').str.strip()
    dfa[column] = dfa[column].astype(float)
    
columns = ['Tests Taken', 'Score=4', 'Score=5']

for column in columns:
    dfa_ap[column] = dfa_ap[column].str.replace(',', '').str.strip()
    dfa_ap[column] = dfa_ap[column].astype(float)
# Removing commas and whitespaces from 'Tests Taken' column


df1['All Grades'] = df1['All Grades'].str.replace(',', '').str.strip()

# Casting 'Tests Taken' column to float
df1['All Grades'] = df1['All Grades'].astype(float)


df1['Total Students'] = df1['Total Students'].str.replace(',', '').str.strip()

# Casting 'Tests Taken' column to float
df1['Total Students'] = df1['Total Students'].astype(float)



df1_ap['Tests Taken'] = df1_ap['Tests Taken'].str.replace(',', '').str.strip()

# Casting 'Tests Taken' column to float
df1_ap['Tests Taken'] = df1_ap['Tests Taken'].astype(float)



df3_ap['Tests Taken'] = df3_ap['Tests Taken'].str.replace(',', '').str.strip()

# Casting 'Tests Taken' column to float
df3_ap['Tests Taken'] = df3_ap['Tests Taken'].astype(float)



df3['All Grades'] = df3['All Grades'].str.replace(',', '').str.strip()

# Casting 'Tests Taken' column to float
df3['All Grades'] = df3['All Grades'].astype(float)



df3['Total Students'] = df3['Total Students'].str.replace(',', '').str.strip()

# Casting 'Tests Taken' column to float
df3['Total Students'] = df3['Total Students'].astype(float)


df3['09'] = df3['09'].str.replace(',', '').str.strip()

# Casting 'Tests Taken' column to float
df3['09'] = df3['09'].astype(float)
print(df1.dtypes, df3.dtypes, df1_ap.dtypes, df3_ap.dtypes)
District Name      object
District Code       int64
K                 float64
01                float64
02                float64
03                float64
04                float64
05                float64
06                float64
07                float64
08                float64
09                float64
10                float64
11                float64
12                float64
All Grades        float64
Total Students    float64
dtype: object District Name      object
District Code       int64
K                 float64
01                float64
02                float64
03                float64
04                float64
05                float64
06                float64
07                float64
08                float64
09                float64
10                float64
11                float64
12                float64
All Grades        float64
Total Students    float64
dtype: object District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
print(dfw.dtypes, dfw_ap.dtypes, dfa.dtypes, dfa_ap.dtypes)
District Name      object
District Code       int64
K                 float64
01                float64
02                float64
03                float64
04                float64
05                float64
06                float64
07                float64
08                float64
09                float64
10                float64
11                float64
12                float64
All Grades        float64
Total Students    float64
dtype: object District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object District Name      object
District Code       int64
K                 float64
01                float64
02                float64
03                float64
04                float64
05                float64
06                float64
07                float64
08                float64
09                float64
10                float64
11                float64
12                float64
All Grades        float64
Total Students    float64
dtype: object District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
print(df1.isnull().sum(),  # column wise null check
df3.isnull().sum()) # column wise null check
District Name       0
District Code       0
K                 147
01                145
02                150
03                134
04                140
05                134
06                124
07                120
08                118
09                109
10                114
11                115
12                116
All Grades         43
Total Students      0
dtype: int64 District Name       0
District Code       0
K                 107
01                106
02                109
03                103
04                100
05                 98
06                 87
07                101
08                100
09                 98
10                 98
11                 97
12                 98
All Grades         16
Total Students      0
dtype: int64
df1_ap.isnull().sum(),  # column wise null check
df3_ap.isnull().sum()#
District Name      0
District Code      0
Tests Taken        0
Score=1          182
Score=2          182
Score=3          182
Score=4          182
Score=5          182
% Score 1-2      182
% Score 3-5      182
dtype: int64
df3.dtypes
District Name         object
District Code          int64
K                    float64
01                   float64
02                   float64
03                   float64
04                   float64
05                   float64
06                   float64
07                   float64
08                   float64
09                   float64
10                   float64
11                   float64
12                   float64
All Grades           float64
Total Students       float64
Hispanic-Latino-%    float64
dtype: object
df1['AfriAmerican-Black-%'] = df1['All Grades'] * 100 / df1['Total Students']
import pandas as pd
import matplotlib.pyplot as plt

sorted_df = df1.sort_values(by='AfriAmerican-Black-%', ascending= False)
top_n = 40
top_districts = sorted_df.head(top_n)
plt.figure(figsize=(10, 12))  # Increase the figure size to show more districts
plt.barh(top_districts['District Name'], top_districts['AfriAmerican-Black-%'])
plt.xlabel('AfriAmerican-Black-%')
plt.ylabel('District Name')
plt.title(f'Top {top_n} Districts with Highest African-American Percentage')
plt.show()
 
df3['Hispanic-Latino-%'] = df3['All Grades'] * 100 / df3['Total Students']
import pandas as pd
import matplotlib.pyplot as plt

sorted_df = df3.sort_values(by='Hispanic-Latino-%', ascending= False)
top_n = 40
top_districts = sorted_df.head(top_n)

# Plot the top districts
plt.figure(figsize=(10, 12))  # Increase the figure size to show more districts
plt.barh(top_districts['District Name'], top_districts['Hispanic-Latino-%'])
plt.xlabel('Hispanic-Latino-%')
plt.ylabel('District Name')
plt.title(f'Top {top_n} Districts with Highest Hispanic-Latino Percentage')
plt.show()

Filling Missing Values

df1_ap.fillna(0, inplace=True)
df3_ap.fillna(0, inplace=True)

df1.fillna(0, inplace=True)
df3.fillna(0, inplace=True)
dfa.fillna(0, inplace=True)
dfw.fillna(0, inplace=True)
dfa_ap.fillna(0, inplace=True)
dfw_ap.fillna(0, inplace=True)
num_zeros1 = (df1_ap == 0).sum().sum()
num_zeros3 = (df3_ap == 0).sum().sum()
num_zeros1, num_zeros3
(1016, 1311)
data_frames = [df1_ap, df3_ap, df1, df3, dfw, dfw_ap, dfa, dfa_ap]

for i in range(4):
    print(data_frames[i].describe())
    print("-------------------")
       District Code  Tests Taken     Score=1    Score=2     Score=3  \
count   1.600000e+02   160.000000  160.000000  160.00000  160.000000   
mean    3.724938e+06    14.250000    6.125000    2.53125    1.850000   
std     4.387719e+06    92.490829   45.990531   19.99400   15.460723   
min     0.000000e+00     1.000000    0.000000    0.00000    0.000000   
25%     1.242500e+06     1.000000    0.000000    0.00000    0.000000   
50%     2.725000e+06     2.000000    0.000000    0.00000    0.000000   
75%     4.872500e+06     5.000000    0.000000    0.00000    0.000000   
max     3.506000e+07  1140.000000  559.000000  246.00000  191.000000   

          Score=4    Score=5  % Score 1-2  % Score 3-5  
count  160.000000  160.00000   160.000000   160.000000  
mean     0.881250    0.45000     9.230625     2.644375  
std      7.441488    4.16605    26.015796     9.631093  
min      0.000000    0.00000     0.000000     0.000000  
25%      0.000000    0.00000     0.000000     0.000000  
50%      0.000000    0.00000     0.000000     0.000000  
75%      0.000000    0.00000     0.000000     0.000000  
max     92.000000   52.00000   100.000000    61.100000  
-------------------
       District Code  Tests Taken     Score=1     Score=2     Score=3  \
count   2.170000e+02   217.000000  217.000000  217.000000  217.000000   
mean    3.635253e+06    16.700461    6.797235    2.525346    2.308756   
std     4.097033e+06   124.348516   56.804289   22.404541   21.474825   
min     0.000000e+00     1.000000    0.000000    0.000000    0.000000   
25%     1.410000e+06     2.000000    0.000000    0.000000    0.000000   
50%     2.620000e+06     3.000000    0.000000    0.000000    0.000000   
75%     4.910000e+06     7.000000    0.000000    0.000000    0.000000   
max     3.902000e+07  1812.000000  815.000000  326.000000  313.000000   

          Score=4     Score=5  % Score 1-2  % Score 3-5  
count  217.000000  217.000000   217.000000   217.000000  
mean     1.405530    1.041475    10.429493     5.699539  
std     13.945382   10.556863    26.152809    16.894585  
min      0.000000    0.000000     0.000000     0.000000  
25%      0.000000    0.000000     0.000000     0.000000  
50%      0.000000    0.000000     0.000000     0.000000  
75%      0.000000    0.000000     0.000000     0.000000  
max    204.000000  154.000000   100.000000    94.700000  
-------------------
       District Code      K     01          02          03          04  \
count   3.840000e+02  384.0  384.0  384.000000  384.000000  384.000000   
mean    4.785052e+06    0.0    0.0    0.052083    0.119792    0.052083   
std     6.699113e+06    0.0    0.0    0.593609    1.377258    0.660244   
min     0.000000e+00    0.0    0.0    0.000000    0.000000    0.000000   
25%     1.597500e+06    0.0    0.0    0.000000    0.000000    0.000000   
50%     3.045000e+06    0.0    0.0    0.000000    0.000000    0.000000   
75%     6.035000e+06    0.0    0.0    0.000000    0.000000    0.000000   
max     3.902000e+07    0.0    0.0   10.000000   23.000000   10.000000   

               05          06          07          08          09         10  \
count  384.000000  384.000000  384.000000  384.000000  384.000000  384.00000   
mean     0.109375    0.442708    1.205729    0.932292    2.536458    2.93750   
std      1.210708    4.585349   12.956520    9.332870   25.555524   29.77963   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.00000   
25%      0.000000    0.000000    0.000000    0.000000    0.000000    0.00000   
50%      0.000000    0.000000    0.000000    0.000000    0.000000    0.00000   
75%      0.000000    0.000000    0.000000    0.000000    1.000000    0.00000   
max     21.000000   85.000000  232.000000  179.000000  487.000000  564.00000   

               11          12   All Grades  Total Students  
count  384.000000  384.000000   384.000000      384.000000  
mean     2.901042    3.515625    14.804688      433.343750  
std     29.245081   35.128927   147.487242     4309.133324  
min      0.000000    0.000000     0.000000        1.000000  
25%      0.000000    0.000000     0.000000       14.000000  
50%      0.000000    0.000000     1.000000       47.500000  
75%      1.000000    1.000000     5.250000      165.500000  
max    557.000000  675.000000  2843.000000    82946.000000  
-------------------
       District Code      K     01          02          03          04  \
count   3.980000e+02  398.0  398.0  398.000000  398.000000  398.000000   
mean    4.740578e+06    0.0    0.0    0.718593    0.919598    0.628141   
std     6.595715e+06    0.0    0.0    8.400624   10.699399    8.106496   
min     0.000000e+00    0.0    0.0    0.000000    0.000000    0.000000   
25%     1.602500e+06    0.0    0.0    0.000000    0.000000    0.000000   
50%     3.045000e+06    0.0    0.0    0.000000    0.000000    0.000000   
75%     6.022500e+06    0.0    0.0    0.000000    0.000000    0.000000   
max     3.902000e+07    0.0    0.0  143.000000  183.000000  125.000000   

               05          06          07          08           09  \
count  398.000000  398.000000  398.000000  398.000000   398.000000   
mean     0.889447    1.226131    2.266332    3.226131     5.665829   
std     10.493507   12.808771   23.325199   34.282338    57.161413   
min      0.000000    0.000000    0.000000    0.000000     0.000000   
25%      0.000000    0.000000    0.000000    0.000000     0.000000   
50%      0.000000    0.000000    0.000000    0.000000     0.000000   
75%      0.000000    0.000000    0.000000    0.000000     2.000000   
max    177.000000  244.000000  451.000000  642.000000  1127.000000   

               10          11          12   All Grades  Total Students  
count  398.000000  398.000000  398.000000   398.000000      398.000000  
mean     4.959799    4.972362    4.969849    30.442211     1059.741206  
std     50.286969   50.409574   50.345547   306.624370    10648.608827  
min      0.000000    0.000000    0.000000     0.000000        1.000000  
25%      0.000000    0.000000    0.000000     0.000000       54.000000  
50%      0.000000    0.000000    0.000000     2.000000      139.000000  
75%      2.000000    2.000000    2.000000    10.000000      309.000000  
max    987.000000  990.000000  989.000000  6058.000000   210254.000000  
-------------------

Required Binning of atleast 1 CS Class Participation into 3 Bins

# Primary: ['K', '01', '02', '03]
# Secondary, Sum_04-08: ['04', '05','06','07','08']
# High: ['09', '10','11','12']
df1.dtypes
District Name            object
District Code             int64
K                       float64
01                      float64
02                      float64
03                      float64
04                      float64
05                      float64
06                      float64
07                      float64
08                      float64
09                      float64
10                      float64
11                      float64
12                      float64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
dtype: object
df1_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
df3.dtypes
District Name         object
District Code          int64
K                    float64
01                   float64
02                   float64
03                   float64
04                   float64
05                   float64
06                   float64
07                   float64
08                   float64
09                   float64
10                   float64
11                   float64
12                   float64
All Grades           float64
Total Students       float64
Hispanic-Latino-%    float64
dtype: object
selected_columns1 = ['K', '01', '02', '03']
df1['Primary'] = df1[selected_columns1].sum(axis=1)
df3['Primary'] = df3[selected_columns1].sum(axis=1)
dfa['Primary'] = dfa[selected_columns1].sum(axis=1)
dfw['Primary'] = dfw[selected_columns1].sum(axis=1)
df1['Primary'].describe()
count    384.000000
mean       0.171875
std        1.882069
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max       33.000000
Name: Primary, dtype: float64
selected_columns2 = ['04', '05','06','07','08']

# Computing the sum along columns axis for each row
df1['Secondary'] = df1[selected_columns2].sum(axis=1)
df3['Secondary'] = df3[selected_columns2].sum(axis=1)
dfa['Secondary'] = dfa[selected_columns2].sum(axis=1)
dfw['Secondary'] = dfw[selected_columns2].sum(axis=1)
selected_columns3 = ['09', '10','11','12']
# Computing the sum along columns axis for each row

df1['High'] = df1[selected_columns3].sum(axis=1)
df3['High'] = df3[selected_columns3].sum(axis=1)
dfa['High'] = dfa[selected_columns3].sum(axis=1)
dfw['High'] = dfw[selected_columns3].sum(axis=1)
df3.isnull().sum() # column wise null check
District Name        0
District Code        0
K                    0
01                   0
02                   0
03                   0
04                   0
05                   0
06                   0
07                   0
08                   0
09                   0
10                   0
11                   0
12                   0
All Grades           0
Total Students       0
Hispanic-Latino-%    0
Primary              0
Secondary            0
High                 0
dtype: int64
columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
df1.drop(columns=columns_to_drop, inplace=True)

columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
df3.drop(columns=columns_to_drop, inplace=True)
columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
dfa.drop(columns=columns_to_drop, inplace=True)

columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
dfw.drop(columns=columns_to_drop, inplace=True)
dataframes = {'dfa': dfa, 'dfw': dfw, 'df1': df1, 'df3': df3}
for name, df in dataframes.items():
    print(f"Data types of DataFrame {name}:")
    print(df.dtypes)
    print()
Data types of DataFrame dfa:
District Name      object
District Code       int64
All Grades        float64
Total Students    float64
Primary           float64
Secondary         float64
High              float64
dtype: object

Data types of DataFrame dfw:
District Name      object
District Code       int64
All Grades        float64
Total Students    float64
Primary           float64
Secondary         float64
High              float64
dtype: object

Data types of DataFrame df1:
District Name            object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
dtype: object

Data types of DataFrame df3:
District Name         object
District Code          int64
All Grades           float64
Total Students       float64
Hispanic-Latino-%    float64
Primary              float64
Secondary            float64
High                 float64
dtype: object

dfw.dtypes
District Name      object
District Code       int64
All Grades        float64
Total Students    float64
Sum_K-3           float64
Sum_04-08         float64
Sum_09-12         float64
dtype: object

Creation of Others Dataframe (Asian + White)

import pandas as pd

# Merge the dataframes based on the common columns 'District Name' and 'District Code'
merged_df_o = pd.merge(dfw, dfa, on=['District Name', 'District Code'], how='outer')
merged_df_o.dtypes
District Name        object
District Code         int64
All Grades_x        float64
Total Students_x    float64
Primary_x           float64
Secondary_x         float64
High_x              float64
All Grades_y        float64
Total Students_y    float64
Primary_y           float64
Secondary_y         float64
High_y              float64
dtype: object
# rename_columns = {
#     'Sum_K-3_x': 'Sum_K-3',
#     'Sum_04-08_x': 'Sum_04-08',
#     'Sum_09-12_x': 'Sum_09-12',
#     'Sum_K-3_y': 'Sum_K-3',
#     'Sum_04-08_y': 'Sum_04-08',
#     'Sum_09-12_y': 'Sum_09-12', 
#     'Total Students_x': 'Total Students', 
#     'All Grades_x': 'All Grades', 
#     'Total Students_y': 'Total Students', 
#     'All Grades_y': 'All Grades', 
    
# }
# merged_df_o = merged_df_o.rename(columns=rename_columns)
merged_df_o
District Name District Code All Grades Total Students Sum_K-3 Sum_04-08 Sum_09-12 All Grades Total Students Sum_K-3 Sum_04-08 Sum_09-12
0 Abby Kelley Foster Charter Public (District) 4450000 102.0 226.0 0.0 87.0 15.0 18.0 45.0 0.0 17.0 1.0
1 Abington 10000 251.0 1607.0 0.0 251.0 0.0 8.0 51.0 0.0 8.0 0.0
2 Academy Of the Pacific Rim Charter Public (Dis... 4120000 0.0 34.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0
3 Acton-Boxborough 6000000 412.0 2616.0 0.0 400.0 12.0 310.0 1706.0 0.0 286.0 24.0
4 Acushnet 30000 786.0 786.0 338.0 448.0 0.0 8.0 8.0 4.0 4.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ...
394 Woburn 3470000 570.0 2837.0 0.0 550.0 20.0 44.0 276.0 0.0 42.0 2.0
395 Worcester 3480000 267.0 6335.0 0.0 241.0 26.0 55.0 1462.0 0.0 43.0 12.0
396 Worthington 3490000 0.0 52.0 0.0 0.0 0.0 NaN NaN NaN NaN NaN
397 Wrentham 3500000 393.0 707.0 393.0 0.0 0.0 15.0 26.0 15.0 0.0 0.0
398 State Totals 0 109389.0 493070.0 32959.0 72627.0 3803.0 12383.0 64434.0 2660.0 9142.0 581.0

399 rows × 12 columns

merged_df_o.dtypes
District Name        object
District Code         int64
All Grades_x        float64
Total Students_x    float64
Primary_x           float64
Secondary_x         float64
High_x              float64
All Grades_y        float64
Total Students_y    float64
Primary_y           float64
Secondary_y         float64
High_y              float64
All Grades          float64
Total Students      float64
Primary             float64
Secondary           float64
High                float64
dtype: object
merged_df_o['All Grades'] = merged_df_o['All Grades_x'] + merged_df_o['All Grades_y']
merged_df_o['Total Students'] = merged_df_o['Total Students_x'] + merged_df_o['Total Students_y']
merged_df_o['Primary'] = merged_df_o['Primary_x'] + merged_df_o['Primary_y']
merged_df_o['Secondary'] = merged_df_o['Secondary_x'] + merged_df_o['Secondary_y']
merged_df_o['High'] = merged_df_o['High_x'] + merged_df_o['High_y']
merged_df_o.drop(['All Grades_x', 'All Grades_y', 'Total Students_x', 'Total Students_y', 'Primary_x', 'Primary_y', 'Secondary_x', 'Secondary_y', 'High_x', 'High_y'], axis=1, inplace=True)
merged_df_o
District Name District Code All Grades Total Students Primary Secondary High
0 Abby Kelley Foster Charter Public (District) 4450000 120.0 271.0 0.0 104.0 16.0
1 Abington 10000 259.0 1658.0 0.0 259.0 0.0
2 Academy Of the Pacific Rim Charter Public (Dis... 4120000 0.0 36.0 0.0 0.0 0.0
3 Acton-Boxborough 6000000 722.0 4322.0 0.0 686.0 36.0
4 Acushnet 30000 794.0 794.0 342.0 452.0 0.0
... ... ... ... ... ... ... ...
394 Woburn 3470000 614.0 3113.0 0.0 592.0 22.0
395 Worcester 3480000 322.0 7797.0 0.0 284.0 38.0
396 Worthington 3490000 NaN NaN NaN NaN NaN
397 Wrentham 3500000 408.0 733.0 408.0 0.0 0.0
398 State Totals 0 121772.0 557504.0 35619.0 81769.0 4384.0

399 rows × 7 columns

merged_df_o['Others-%'] = merged_df_o['All Grades'] * 100 / merged_df_o['Total Students']
merged_df_o.dtypes
District Name      object
District Code       int64
All Grades        float64
Total Students    float64
Primary           float64
Secondary         float64
High              float64
Others-%          float64
dtype: object
merged_df_o.fillna(0, inplace=True)
!ls
ap_performance_asian.xlsx
ap_performance_white.xlsx
artcourse-asian.xlsx
artcourse-white.xlsx
merged_df_ap_others_ap_performance_cleaned.xlsx
merged_df_o.to_excel('merged_df_others_cs-class.xlsx', index=False)
nan_values = merged_df_o.isnull().sum()
print(nan_values)
District Name     0
District Code     0
All Grades        0
Total Students    0
Primary           0
Secondary         0
High              0
Others-%          0
dtype: int64
len(dfw_ap)
273
len(dfa_ap)
220
dfw_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
dfa_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
merged_df_o_ap = pd.merge(dfw_ap, dfa_ap, on=['District Name', 'District Code'], how='outer')
merged_df_o_ap.dtypes
District Name     object
District Code      int64
Tests Taken_x    float64
Score=1_x        float64
Score=2_x        float64
Score=3_x        float64
Score=4_x        float64
Score=5_x        float64
% Score 1-2_x    float64
% Score 3-5_x    float64
Tests Taken_y    float64
Score=1_y        float64
Score=2_y        float64
Score=3_y        float64
Score=4_y        float64
Score=5_y        float64
% Score 1-2_y    float64
% Score 3-5_y    float64
dtype: object
merged_df_o_ap['Tests Taken'] = merged_df_o_ap['Tests Taken_x'] + merged_df_o_ap['Tests Taken_y']
merged_df_o_ap['Score=1'] = merged_df_o_ap['Score=1_x'] + merged_df_o_ap['Score=1_y']
merged_df_o_ap['Score=2'] = merged_df_o_ap['Score=2_x'] + merged_df_o_ap['Score=2_y']
merged_df_o_ap['Score=3'] = merged_df_o_ap['Score=3_x'] + merged_df_o_ap['Score=3_y']
merged_df_o_ap['Score=4'] = merged_df_o_ap['Score=4_x'] + merged_df_o_ap['Score=4_y']
merged_df_o_ap['Score=5'] = merged_df_o_ap['Score=5_x'] + merged_df_o_ap['Score=5_y']
merged_df_o_ap['Tests Taken'] = merged_df_o_ap['Tests Taken_x'] + merged_df_o_ap['Tests Taken_y']

merged_df_o_ap['Score=1'] = merged_df_o_ap['Score=1_x'] + merged_df_o_ap['Score=1_y']

merged_df_o_ap['Score=2'] = merged_df_o_ap['Score=2_x'] + merged_df_o_ap['Score=2_y']

merged_df_o_ap['Score=3'] = merged_df_o_ap['Score=3_x'] + merged_df_o_ap['Score=3_y']

merged_df_o_ap['Score=4'] = merged_df_o_ap['Score=4_x'] + merged_df_o_ap['Score=4_y']

merged_df_o_ap['Score=5'] = merged_df_o_ap['Score=5_x'] + merged_df_o_ap['Score=5_y']
merged_df_o_ap.drop(['Tests Taken_x', 'Tests Taken_y', 'Score=1_x', 'Score=1_y', 'Score=2_x', 'Score=2_y', 'Score=3_x', 'Score=3_y', 'Score=4_x', 'Score=4_y', 'Score=5_x', 'Score=5_y', '% Score 1-2_x', '% Score 1-2_y', '% Score 3-5_x', '% Score 3-5_y'], axis=1, inplace=True)
merged_df_o_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
dtype: object
merged_df_o_ap['% Score 1-2'] = ((merged_df_o_ap['Score=1'] + merged_df_o_ap['Score=2'])/merged_df_o_ap['Tests Taken'])
merged_df_o_ap['% Score 3-5'] = ((merged_df_o_ap['Score=3'] + merged_df_o_ap['Score=4'] + merged_df_o_ap['Score=5'])/merged_df_o_ap['Tests Taken'])
merged_df_o_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
merged_df_o_ap.fillna(0, inplace=True)
merged_df_o_ap.drop(['Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5'], axis=1, inplace=True)
                                         District Name  District Code  \
0                                             Abington          10000   
1    Academy Of the Pacific Rim Charter Public (Dis...        4120000   
2                                     Acton-Boxborough        6000000   
3    Advanced Math and Science Academy Charter (Dis...        4300000   
4                                               Agawam          50000   
..                                                 ...            ...   
271                                          Worcester        3480000   
272                                       State Totals              0   
273                                            Chelsea         570000   
274     Collegiate Charter School of Lowell (District)       35030000   
275                  Sturgis Charter Public (District)        4890000   

     Tests Taken  % Score 1-2  % Score 3-5  
0           25.0     0.160000     0.680000  
1            0.0     0.000000     0.000000  
2          305.0     0.062295     0.937705  
3          135.0     0.111111     0.888889  
4           37.0     0.432432     0.459459  
..           ...          ...          ...  
271        275.0     0.552727     0.447273  
272      18046.0     0.302394     0.697606  
273          0.0     0.000000     0.000000  
274          0.0     0.000000     0.000000  
275          0.0     0.000000     0.000000  

[276 rows x 5 columns]
merged_df_o_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
!ls
OLD_merged_df_ap_others_ap_performance_cleaned.xlsx
ap_performance_asian.xlsx
ap_performance_white.xlsx
artcourse-asian.xlsx
artcourse-white.xlsx
merged_df_others_cs-class.xlsx
merged_df_o_ap.to_excel('merged_df_ap_performance_others.xlsx', index=False)
len(merged_df_o_ap)
276
len(merged_df_o)
399
data_frames = {'df1': df1, 'df1_ap': df1_ap, 'df3': df3, 'df3_ap': df3_ap}

for df_name, df in data_frames.items():
    unique_values = df['District Code'].nunique()
    print("Number of unique values in 'District Code' for", df_name, ":", unique_values)
Number of unique values in 'District Code' for df1 : 384
Number of unique values in 'District Code' for df1_ap : 160
Number of unique values in 'District Code' for df3 : 398
Number of unique values in 'District Code' for df3_ap : 217
df1_ap.dtypes
# df3_ap.fillna(0, inplace=True)
# df1.isnull().sum() # column wise null check
District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
df3_ap.isnull().sum()
District Name    0
District Code    0
Tests Taken      0
% Score 1-2      0
% Score 3-5      0
dtype: int64
df3_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
Score=1          float64
Score=2          float64
Score=3          float64
Score=4          float64
Score=5          float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
columns_to_drop = ['Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5']
df1_ap.drop(columns=columns_to_drop, inplace=True)
df3_ap.drop(columns=columns_to_drop, inplace=True)
df1_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object
df3_ap.dtypes
District Name     object
District Code      int64
Tests Taken      float64
% Score 1-2      float64
% Score 3-5      float64
dtype: object

Joining of Dataframes (CS Classes Participation + AP Test Scores)

len(merged_df_o_ap)
276
len(merged_df_o)
399
merged_df_others_o_ap = merged_df_o.merge(merged_df_o_ap, on='District Code', how='inner')
len(df1)
384
len(df1_ap)
160
len(df3)
398
len(df3_ap)
217
# Join 1: inner

merged_df1_i = df1.merge(df1_ap, on='District Code', how='inner')
merged_df3_i = df3.merge(df3_ap, on='District Code', how='inner')
merged_df3_i.head(5)
District Name_x District Code All Grades Total Students Hispanic-Latino-% Primary Secondary High District Name_y Tests Taken % Score 1-2 % Score 3-5
0 Abington 10000 1.0 281.0 0.355872 0.0 0.0 1.0 Abington 2.0 0.0 0.0
1 Academy Of the Pacific Rim Charter Public (Dis... 4120000 8.0 144.0 5.555556 0.0 0.0 8.0 Academy Of the Pacific Rim Charter Public (Dis... 5.0 0.0 0.0
2 Acton-Boxborough 6000000 3.0 379.0 0.791557 0.0 0.0 3.0 Acton-Boxborough 4.0 0.0 0.0
3 Advanced Math and Science Academy Charter (Dis... 4300000 26.0 74.0 35.135135 0.0 11.0 15.0 Advanced Math and Science Academy Charter (Dis... 3.0 0.0 0.0
4 Amesbury 70000 10.0 172.0 5.813953 0.0 0.0 10.0 Amesbury 1.0 0.0 0.0
merged_df1_i.head(5)
District Name_x District Code All Grades Total Students AfriAmerican-Black-% Primary Secondary High District Name_y Tests Taken % Score 1-2 % Score 3-5
0 Abington 10000 1.0 111.0 0.900901 0.0 0.0 1.0 Abington 1.0 0.0 0.0
1 Academy Of the Pacific Rim Charter Public (Dis... 4120000 14.0 307.0 4.560261 0.0 0.0 14.0 Academy Of the Pacific Rim Charter Public (Dis... 7.0 0.0 0.0
2 Advanced Math and Science Academy Charter (Dis... 4300000 10.0 21.0 47.619048 0.0 3.0 7.0 Advanced Math and Science Academy Charter (Dis... 3.0 0.0 0.0
3 Amherst-Pelham 6050000 29.0 108.0 26.851852 0.0 0.0 29.0 Amherst-Pelham 2.0 0.0 0.0
4 Andover 90000 9.0 139.0 6.474820 0.0 0.0 9.0 Andover 2.0 0.0 0.0
merged_df_o_ap.head(5)
District Name District Code Tests Taken % Score 1-2 % Score 3-5
0 Abington 10000 25.0 0.160000 0.680000
1 Academy Of the Pacific Rim Charter Public (Dis... 4120000 0.0 0.000000 0.000000
2 Acton-Boxborough 6000000 305.0 0.062295 0.937705
3 Advanced Math and Science Academy Charter (Dis... 4300000 135.0 0.111111 0.888889
4 Agawam 50000 37.0 0.432432 0.459459
merged_df_o.head(5)
District Name District Code All Grades Total Students Primary Secondary High Others-%
0 Abby Kelley Foster Charter Public (District) 4450000 120.0 271.0 0.0 104.0 16.0 44.280443
1 Abington 10000 259.0 1658.0 0.0 259.0 0.0 15.621230
2 Academy Of the Pacific Rim Charter Public (Dis... 4120000 0.0 36.0 0.0 0.0 0.0 0.000000
3 Acton-Boxborough 6000000 722.0 4322.0 0.0 686.0 36.0 16.705229
4 Acushnet 30000 794.0 794.0 342.0 452.0 0.0 100.000000
merged_df1_l = df1.merge(df1_ap, on='District Code', how='left')
merged_df3_l = df3.merge(df3_ap, on='District Code', how='left')
merged_df_others_o_ap_left = merged_df_o.merge(merged_df_o_ap, on='District Code', how='inner')
len(merged_df_others_o_ap_left)
276
len(merged_df1_l)
384
len(merged_df3_l)
398
merged_df_others_o_ap_left
merged_df_others_o_ap.fillna(0, inplace=True)
merged_df1_l.fillna(0, inplace=True)
merged_df3_l.fillna(0, inplace=True)
merged_df1_l.isnull().sum()
District Name_x         0
District Code           0
All Grades              0
Total Students          0
AfriAmerican-Black-%    0
Primary                 0
Secondary               0
High                    0
District Name_y         0
Tests Taken             0
% Score 1-2             0
% Score 3-5             0
dtype: int64
merged_df3_l.isnull().sum()
District Name_x      0
District Code        0
All Grades           0
Total Students       0
Hispanic-Latino-%    0
Primary              0
Secondary            0
High                 0
District Name_y      0
Tests Taken          0
% Score 1-2          0
% Score 3-5          0
dtype: int64
merged_df_others_o_ap.isnull().sum()
District Name_x    0
District Code      0
All Grades         0
Total Students     0
Primary            0
Secondary          0
High               0
Others-%           0
District Name_y    0
Tests Taken        0
% Score 1-2        0
% Score 3-5        0
dtype: int64
merged_df3_l.head(5)
District Name_x District Code All Grades Total Students Hispanic-Latino-% Primary Secondary High District Name_y Tests Taken % Score 1-2 % Score 3-5
0 Abby Kelley Foster Charter Public (District) 4450000 0.0 293.0 0.000000 0.0 0.0 0.0 0 0.0 0.0 0.0
1 Abington 10000 1.0 281.0 0.355872 0.0 0.0 1.0 Abington 2.0 0.0 0.0
2 Academy Of the Pacific Rim Charter Public (Dis... 4120000 8.0 144.0 5.555556 0.0 0.0 8.0 Academy Of the Pacific Rim Charter Public (Dis... 5.0 0.0 0.0
3 Acton-Boxborough 6000000 3.0 379.0 0.791557 0.0 0.0 3.0 Acton-Boxborough 4.0 0.0 0.0
4 Acushnet 30000 0.0 42.0 0.000000 0.0 0.0 0.0 0 0.0 0.0 0.0
merged_df_others_o_ap.head(5)
District Name_x District Code All Grades Total Students Primary Secondary High Others-% District Name_y Tests Taken % Score 1-2 % Score 3-5
0 Abington 10000 259.0 1658.0 0.0 259.0 0.0 15.621230 Abington 25.0 0.160000 0.680000
1 Academy Of the Pacific Rim Charter Public (Dis... 4120000 0.0 36.0 0.0 0.0 0.0 0.000000 Academy Of the Pacific Rim Charter Public (Dis... 0.0 0.000000 0.000000
2 Acton-Boxborough 6000000 722.0 4322.0 0.0 686.0 36.0 16.705229 Acton-Boxborough 305.0 0.062295 0.937705
3 Advanced Math and Science Academy Charter (Dis... 4300000 315.0 808.0 0.0 225.0 90.0 38.985149 Advanced Math and Science Academy Charter (Dis... 135.0 0.111111 0.888889
4 Agawam 50000 883.0 2789.0 0.0 867.0 16.0 31.660093 Agawam 37.0 0.432432 0.459459
merged_df1_l.head(5)
District Name_x District Code All Grades Total Students AfriAmerican-Black-% Primary Secondary High District Name_y Tests Taken % Score 1-2 % Score 3-5
0 Abby Kelley Foster Charter Public (District) 4450000 0.0 775.0 0.000000 0.0 0.0 0.0 0 0.0 0.0 0.0
1 Abington 10000 1.0 111.0 0.900901 0.0 0.0 1.0 Abington 1.0 0.0 0.0
2 Academy Of the Pacific Rim Charter Public (Dis... 4120000 14.0 307.0 4.560261 0.0 0.0 14.0 Academy Of the Pacific Rim Charter Public (Dis... 7.0 0.0 0.0
3 Acton-Boxborough 6000000 0.0 153.0 0.000000 0.0 0.0 0.0 0 0.0 0.0 0.0
4 Acushnet 30000 0.0 12.0 0.000000 0.0 0.0 0.0 0 0.0 0.0 0.0
merged_df3_l.head(5)
District Name_x District Code All Grades Total Students Hispanic-Latino-% Primary Secondary High District Name_y Tests Taken % Score 1-2 % Score 3-5
0 Abby Kelley Foster Charter Public (District) 4450000 0.0 293.0 0.000000 0.0 0.0 0.0 0 0.0 0.0 0.0
1 Abington 10000 1.0 281.0 0.355872 0.0 0.0 1.0 Abington 2.0 0.0 0.0
2 Academy Of the Pacific Rim Charter Public (Dis... 4120000 8.0 144.0 5.555556 0.0 0.0 8.0 Academy Of the Pacific Rim Charter Public (Dis... 5.0 0.0 0.0
3 Acton-Boxborough 6000000 3.0 379.0 0.791557 0.0 0.0 3.0 Acton-Boxborough 4.0 0.0 0.0
4 Acushnet 30000 0.0 42.0 0.000000 0.0 0.0 0.0 0 0.0 0.0 0.0
!ls
AfricAmerican-Black                     artcourse-all-students.xlsx
AmericanIndian-Natives                  merged_df_ap_others_ap_performance.xlsx
Hispanic-Latino                         merged_df_ap_performance_others.xlsx
Others-(Asian + White)                  old_merged_df_others.xlsx
ap_performance_allstudents.xlsx
merged_df1_i.dtypes
District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object
!ls
OLD_merged_df_ap_others_ap_performance_cleaned.xlsx
ap_performance_asian.xlsx
ap_performance_white.xlsx
artcourse-asian.xlsx
artcourse-white.xlsx
merged_df_ap_performance_others.xlsx
merged_df_others_cs-class.xlsx
merged_df1_i.to_excel('merged_df_Afric-American.xlsx', index=False)
merged_df3_i.to_excel('merged_df_Hispanic-Latino.xlsx', index=False)
merged_df_others_o_ap.to_excel('merged_df_Others-(Asian+White).xlsx', index=False)
Not Good Scatter Plots
import matplotlib.pyplot as plt
scatter_data = merged_df2[['Sum_K-3', 'Sum_04-08', 'Sum_09-12', 'Tests Taken']]

# Creating the scatter plot
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_K-3'], label='Sum_K-3')
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_04-08'], label='Sum_04-08')
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_09-12'], label='Sum_09-12')

# Adding labels and legend
plt.xlabel('Tests Taken')
plt.ylabel('Sum Values')
plt.legend()

# Displaying the plot
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Assuming you already have the DataFrame 'merged_df2'

# Extracting the required columns
scatter_data = merged_df2[['Sum_K-3', 'Sum_04-08', 'Sum_09-12', 'Tests Taken']]

# Creating the scatter plot
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_K-3'], label='Sum_K-3', alpha=0.5)
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_04-08'], label='Sum_04-08', alpha=0.5)
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_09-12'], label='Sum_09-12', alpha=0.5)

# Adjusting x-axis scale
plt.xlim(0, 200)

# Adding labels, legend, and gridlines
plt.xlabel('Tests Taken')
plt.ylabel('Sum Values')
plt.legend()
plt.grid(True)

# Displaying the plot
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Assuming you already have the DataFrame 'merged_df2'

# Create a 3D scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Extract the required columns
x = merged_df2['Tests Taken']
y = merged_df2['Sum_K-3']
z = merged_df2['Sum_04-08']
c = merged_df2['Sum_09-12']

# Scatter plot with color-coded points based on 'Sum_09-12'
scatter = ax.scatter(x, y, z, c=c, cmap='viridis')

# Set labels and title
ax.set_xlabel('Tests Taken')
ax.set_ylabel('Sum_K-3')
ax.set_zlabel('Sum_04-08')
ax.set_title('3D Scatter Plot')

# Add a colorbar
cbar = plt.colorbar(scatter)
cbar.set_label('Sum_09-12')

# Show the plot
plt.show()
More Scatter Plot Exploration
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming you already have the DataFrame 'merged_df2'

# Select the required columns
heatmap_data = merged_df2[['Sum_K-3', 'Sum_04-08', 'Sum_09-12', 'Tests Taken']]

# Compute the correlation matrix
correlation_matrix = heatmap_data.corr()

# Create a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

# Set the title
plt.title('Heatmap of Correlation')

# Show the plot
plt.show()
# from pandas_profiling import ProfileReport

# # Assuming you already have the DataFrame 'merged_df2'

# # Generate the pandas profiling report
# profile = ProfileReport(merged_df2, title='Pandas Profiling Report')

# # Display the report as an interactive widget form
# profile.to_widgets()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Assuming you already have the DataFrame 'merged_df2'

# Create a 3D scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Extract the required columns
x = merged_df2['Tests Taken']
y = merged_df2['Sum_K-3']
z = merged_df2['Sum_04-08']
c = merged_df2['Sum_09-12']
labels = merged_df2['District Name_x']

# Scatter plot with color-coded points based on 'Sum_09-12'
scatter = ax.scatter(x, y, z, c=c, cmap='viridis')

# Add labels to data points
for i, label in enumerate(labels):
    ax.text(x[i], y[i], z[i], label, color='black', fontsize=8, ha='center', va='center')

# Set labels and title
ax.set_xlabel('Tests Taken')
ax.set_ylabel('Sum_K-3')
ax.set_zlabel('Sum_04-08')
ax.set_title('3D Scatter Plot')

# Add a colorbar
cbar = plt.colorbar(scatter)
cbar.set_label('Sum_09-12')

# Show the plot
plt.show()
merged_df1_i.dtypes
District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object
merged_df_others_o_ap.dtypes
District Name_x     object
District Code        int64
All Grades         float64
Total Students     float64
Primary            float64
Secondary          float64
High               float64
Others-%           float64
District Name_y     object
Tests Taken        float64
% Score 1-2        float64
% Score 3-5        float64
dtype: object
# column_index_to_drop = 7  # Specify the index of the column you want to drop
# merged_df_others_o_ap.drop(merged_df_others_o_ap.columns[column_index_to_drop], axis=1, inplace=True)
 
# columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# merged_df1_i.drop(columns=columns_to_drop, inplace=True)

# columns_to_drop = ['K', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# merged_df3_i.drop(columns=columns_to_drop, inplace=True)
merged_df1_i.dtypes
District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object
merged_df3_i.dtypes
District Name_x       object
District Code          int64
All Grades           float64
Total Students       float64
Hispanic-Latino-%    float64
Primary              float64
Secondary            float64
High                 float64
District Name_y       object
Tests Taken          float64
% Score 1-2          float64
% Score 3-5          float64
dtype: object
merged_df3_i.dtypes
District Name_x       object
District Code          int64
All Grades           float64
Total Students       float64
Hispanic-Latino-%    float64
Primary              float64
Secondary            float64
High                 float64
District Name_y       object
Tests Taken          float64
% Score 1-2          float64
% Score 3-5          float64
dtype: object
merged_df1_i.dtypes
District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object
 
# merged_df1_i = merged_df1_i.drop(['CS course Binary for Sum_K-3', 'CS course Binary for Sum_04-08', 'CS course Binary for Sum_09-12'], axis=1)

Pandas Profiling

!pip install -U ydata-profiling
Requirement already satisfied: ydata-profiling in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (4.3.1)
Requirement already satisfied: pandas!=1.4.0,<2.1,>1.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (1.4.4)
Requirement already satisfied: typeguard<3,>=2.13.2 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (2.13.3)
Requirement already satisfied: imagehash==4.3.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (4.3.1)
Requirement already satisfied: wordcloud>=1.9.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (1.9.2)
Requirement already satisfied: multimethod<2,>=1.4 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (1.9.1)
Requirement already satisfied: scipy<1.11,>=1.4.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (1.9.1)
Requirement already satisfied: matplotlib<4,>=3.2 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (3.5.2)
Requirement already satisfied: tqdm<5,>=4.48.2 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (4.64.1)
Requirement already satisfied: visions[type_image_path]==0.7.5 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (0.7.5)
Requirement already satisfied: pydantic<2,>=1.8.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (1.10.9)
Requirement already satisfied: jinja2<3.2,>=2.11.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (2.11.3)
Requirement already satisfied: phik<0.13,>=0.11.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (0.12.3)
Requirement already satisfied: dacite>=1.8 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (1.8.1)
Requirement already satisfied: numpy<1.24,>=1.16.0 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (1.21.5)
Requirement already satisfied: seaborn<0.13,>=0.10.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (0.11.2)
Requirement already satisfied: htmlmin==0.1.12 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (0.1.12)
Requirement already satisfied: PyYAML<6.1,>=5.0.0 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (6.0)
Requirement already satisfied: statsmodels<1,>=0.13.2 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (0.13.2)
Requirement already satisfied: requests<3,>=2.24.0 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from ydata-profiling) (2.28.1)
Requirement already satisfied: PyWavelets in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from imagehash==4.3.1->ydata-profiling) (1.3.0)
Requirement already satisfied: pillow in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from imagehash==4.3.1->ydata-profiling) (9.2.0)
Requirement already satisfied: attrs>=19.3.0 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (21.4.0)
Requirement already satisfied: tangled-up-in-unicode>=0.0.4 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (0.2.0)
Requirement already satisfied: networkx>=2.4 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from visions[type_image_path]==0.7.5->ydata-profiling) (2.8.4)
Requirement already satisfied: MarkupSafe>=0.23 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from jinja2<3.2,>=2.11.1->ydata-profiling) (2.0.1)
Requirement already satisfied: python-dateutil>=2.7 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<4,>=3.2->ydata-profiling) (2.8.2)
Requirement already satisfied: packaging>=20.0 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<4,>=3.2->ydata-profiling) (21.3)
Requirement already satisfied: pyparsing>=2.2.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<4,>=3.2->ydata-profiling) (3.0.9)
Requirement already satisfied: fonttools>=4.22.0 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<4,>=3.2->ydata-profiling) (4.25.0)
Requirement already satisfied: cycler>=0.10 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<4,>=3.2->ydata-profiling) (0.11.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from matplotlib<4,>=3.2->ydata-profiling) (1.4.2)
Requirement already satisfied: pytz>=2020.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from pandas!=1.4.0,<2.1,>1.1->ydata-profiling) (2022.1)
Requirement already satisfied: joblib>=0.14.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from phik<0.13,>=0.11.1->ydata-profiling) (1.1.0)
Requirement already satisfied: typing-extensions>=4.2.0 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from pydantic<2,>=1.8.1->ydata-profiling) (4.3.0)
Requirement already satisfied: charset-normalizer<3,>=2 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from requests<3,>=2.24.0->ydata-profiling) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from requests<3,>=2.24.0->ydata-profiling) (3.3)
Requirement already satisfied: certifi>=2017.4.17 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from requests<3,>=2.24.0->ydata-profiling) (2022.9.24)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from requests<3,>=2.24.0->ydata-profiling) (1.26.11)
Requirement already satisfied: patsy>=0.5.2 in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from statsmodels<1,>=0.13.2->ydata-profiling) (0.5.2)
Requirement already satisfied: six in /Users/priyank/opt/anaconda3/lib/python3.9/site-packages (from patsy>=0.5.2->statsmodels<1,>=0.13.2->ydata-profiling) (1.16.0)
!ls
from ydata_profiling import ProfileReport

profile1 = ProfileReport(merged_df1_i, title="District-wise Profiling Report of African American/Black CS Course Takers", minimal= False)
profile1.to_file("District-wise_African_American_CS_Course_Takers_Profiling_Report.html")

profile2 = ProfileReport(merged_df3_i, title="District-wise Profiling Report of Hispanic/Latino CS Course Takers", minimal= False)
profile2.to_file("District-wise_Hispanic_Latino_CS_Course_Takers_Profiling_Report.html")

profile3 = ProfileReport(merged_df_others_o_ap, title="District-wise Profiling Report of Others(Asian & White) CS Course Takers", minimal= False)
profile3.to_file("District-wise_Others_CS_Course_Takers_Profiling_Report.html")
!ls
Afri-Amer-Black.png    Hispanic-Latino.png    Others-Asian-White.png
from PIL import Image
import matplotlib.pyplot as plt
img = Image.open('image.jpg')
plt.imshow(img)
plt.title('Image Title')

Afri-Amer-Black.png    Hispanic-Latino.png    Others-Asian-White.png

Results from Pandas Profiling Reports for 3 Ethinicities & Digging Deeper into interesting Pair Wise Correlations:

1: African-American/Black
from PIL import Image
import matplotlib.pyplot as plt

img1 = Image.open('Afri-Amer-Black.png')
plt.imshow(img1)
plt.title('African American/Black CS Course Takers')
plt.show()

Take Aways from Above Heatmap of African-American/Black:

  • High School has high Correlation with All grades
merged_df1_i.dtypes
District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object
import matplotlib.pyplot as plt

# Filtering 'Secondary' values to be within the range 0-100
filtered_merged_df1_i = merged_df1_i[(merged_df1_i['Secondary'] >= 0) & (merged_df1_i['Secondary'] <= 100)]

# Scatter plot
plt.scatter(filtered_merged_df1_i['Secondary'], filtered_merged_df1_i['% Score 3-5'])
plt.xlabel('Secondary')
plt.ylabel('% Score 3-5')
plt.title('Scatter Plot: % Score 3-5 vs. Secondary')
plt.grid(True)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# Filtering 'Secondary' values to be within the range 0-100
filtered_merged_df1_i = merged_df1_i[(merged_df1_i['Secondary'] >= 0) & (merged_df1_i['Secondary'] <= 100)]

# Scatter plot using Seaborn
sns.scatterplot(x='Secondary', y='% Score 3-5', data=filtered_merged_df1_i)
plt.xlabel('Secondary')
plt.ylabel('% Score 3-5')
plt.title('Scatter Plot: Secondary vs  % Score 3-5')
plt.grid(True)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# Filtering 'High' values to be within the range 0-100
filtered_merged_df1_i = merged_df1_i[(merged_df1_i['High'] >= 0) & (merged_df1_i['High'] <= 100)]

# Scatter plot using Seaborn
sns.scatterplot(x='High', y='% Score 3-5', data=filtered_merged_df1_i)
plt.xlabel('High')
plt.ylabel('% Score 3-5')
plt.title('Scatter Plot: High vs  % Score 3-5')
plt.grid(True)
plt.show()
merged_df1_i.dtypes
District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object
import seaborn as sns
import matplotlib.pyplot as plt

# Filtering 'High' values to be within the range 0-200
filtered_merged_df1_i = merged_df1_i[(merged_df1_i['High'] >= 0) & (merged_df1_i['High'] <= 200)]

# Scatter plot using Seaborn
sns.scatterplot(x='High', y='All Grades', data=filtered_merged_df1_i)
plt.xlabel('High')
plt.ylabel('All Grades')
plt.title('Scatter Plot: High vs All Grades')
plt.grid(True)
plt.show()

The Above Scatter Plot Showcases Clear Correlation between High School and All Grades indicating this Ethnic Group contribution to ALL grades is higher in later classes than primary or secondary

merged_df1_i.dtypes
District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot using Seaborn
sns.scatterplot(x='AfriAmerican-Black-%', y='% Score 3-5', data=merged_df1_i)
plt.xlabel('AfriAmerican-Black-%')
plt.ylabel('% Score 3-5')
plt.title('Scatter Plot: AfriAmerican-Black-% vs Passing Percentage')
plt.grid(True)
plt.show()
2: Hispanic/Latino
from PIL import Image
import matplotlib.pyplot as plt

img2 = Image.open('Hispanic-Latino.png')
plt.imshow(img2)
plt.title('Hispanic/Latino CS Course Takers')
plt.show()

Take Aways from Above Heatmap of Hispanic/Latino:

  • High School has high Correlation with All grades
merged_df3_i.dtypes
District Name_x       object
District Code          int64
All Grades           float64
Total Students       float64
Hispanic-Latino-%    float64
Primary              float64
Secondary            float64
High                 float64
District Name_y       object
Tests Taken          float64
% Score 1-2          float64
% Score 3-5          float64
dtype: object
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot using Seaborn
sns.scatterplot(x='Hispanic-Latino-%', y='% Score 3-5', data=merged_df3_i)
plt.xlabel('Hispanic-Latino-%')
plt.ylabel('% Score 3-5')
plt.title('Scatter Plot: Hispanic-Latino-% vs Passing Percentage')
plt.grid(True)
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt

# Filtering 'High' values to be within the range 0-200
filtered_merged_df3_i = merged_df3_i[(merged_df3_i['High'] >= 0) & (merged_df3_i['High'] <= 200)]
filtered_merged_df3_i_primary = merged_df3_i[(merged_df3_i['Primary'] >= 0) & (merged_df3_i['Primary'] <= 200)]
filtered_merged_df3_i_secondary = merged_df3_i[(merged_df3_i['Secondary'] >= 0) & (merged_df3_i['Secondary'] <= 200)]

# Scatter plot: High vs All Grades
plt.figure(figsize=(8, 6))
sns.scatterplot(x='High', y='All Grades', data=filtered_merged_df3_i)
plt.xlabel('High School Students (% of Total Students)')
plt.ylabel('All Grades')
plt.title('Correlation between High School Students and All Grades')
plt.grid(True)
plt.show()

# Scatter plot: Primary vs All Grades
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Primary', y='All Grades', data=filtered_merged_df3_i_primary)
plt.xlabel('Primary School Students (% of Total Students)')
plt.ylabel('All Grades')
plt.title('Correlation between Primary School Students and All Grades')
plt.grid(True)
plt.show()

# Scatter plot: Secondary vs All Grades
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Secondary', y='All Grades', data=filtered_merged_df3_i_secondary)
plt.xlabel('Secondary School Students (% of Total Students)')
plt.ylabel('All Grades')
plt.title('Correlation between Secondary School Students and All Grades')
plt.grid(True)
plt.show()

The Above 3 Scatter Plots Showcases a Strong Correlation between High School and All Grades & High Overall Correlation between secondary & All grades.

3: Others: (Asian + White)
from PIL import Image
import matplotlib.pyplot as plt

img3 = Image.open('Others-Asian-White.png')
plt.imshow(img3)
plt.title('Other/Asian/White CS Course Takers')
plt.show()

Others (Asian & White):

  • Percentage of District wise participation in CS Classes is Highly Proportional to All grades
  • Secondary is highly proportional to all grades & Primary is also contributes to all grades but less than secondary
merged_df_others_o_ap.dtypes
District Name_x     object
District Code        int64
All Grades         float64
Total Students     float64
Primary            float64
Secondary          float64
High               float64
Others-%           float64
District Name_y     object
Tests Taken        float64
% Score 1-2        float64
% Score 3-5        float64
dtype: object
import seaborn as sns
import matplotlib.pyplot as plt

# Filtering 'High' values to be within the range 0-200
filtered_merged_df_others_o_ap = merged_df_others_o_ap[(merged_df_others_o_ap['High'] >= 0) & (merged_df_others_o_ap['High'] <= 200)]
filtered_merged_df_others_o_ap_primary = merged_df_others_o_ap[(merged_df_others_o_ap['Primary'] >= 0) & (merged_df_others_o_ap['Primary'] <= 200)]
filtered_merged_df_others_o_ap_secondary = merged_df_others_o_ap[(merged_df_others_o_ap['Secondary'] >= 0) & (merged_df_others_o_ap['Secondary'] <= 200)]

# Scatter plot: High vs All Grades
plt.figure(figsize=(8, 6))
sns.scatterplot(x='High', y='All Grades', data=filtered_merged_df_others_o_ap)
plt.xlabel('High School Students (% of Total Students)')
plt.ylabel('All Grades')
plt.title('Correlation between High School Students and All Grades')
plt.grid(True)
plt.show()

# Scatter plot: Primary vs All Grades
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Primary', y='All Grades', data=filtered_merged_df_others_o_ap_primary)
plt.xlabel('Primary School Students (% of Total Students)')
plt.ylabel('All Grades')
plt.title('Correlation between Primary School Students and All Grades')
plt.grid(True)
plt.show()

# Scatter plot: Secondary vs All Grades
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Secondary', y='All Grades', data=filtered_merged_df_others_o_ap_secondary)
plt.xlabel('Secondary School Students (% of Total Students)')
plt.ylabel('All Grades')
plt.title('Correlation between Secondary School Students and All Grades')
plt.grid(True)
plt.show()

The Above 3 Scatter Plots Showcases a Strong Overall Correlation between High School, Secondary & Primary vs All grades.

import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot using Seaborn
sns.scatterplot(x='Others-%', y='% Score 3-5', data=merged_df_others_o_ap)
plt.xlabel('Others-%')
plt.ylabel('% Score 3-5')
plt.title('Scatter Plot: Others(Asian&White)-% vs Passing Percentage')
plt.grid(True)
plt.show()

Dual Box Plots Creation on Basis of CS Course Offered or CS Course Not-Offered:

print(merged_df_others_o_ap.dtypes, merged_df1_i.dtypes, merged_df3_i.dtypes)
District Name_x     object
District Code        int64
All Grades         float64
Total Students     float64
Primary            float64
Secondary          float64
High               float64
Others-%           float64
District Name_y     object
Tests Taken        float64
% Score 1-2        float64
% Score 3-5        float64
dtype: object District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object District Name_x       object
District Code          int64
All Grades           float64
Total Students       float64
Hispanic-Latino-%    float64
Primary              float64
Secondary            float64
High                 float64
District Name_y       object
Tests Taken          float64
% Score 1-2          float64
% Score 3-5          float64
dtype: object
1: AfricanAmerican/Black
merged_df1_i.dtypes
District Name_x          object
District Code             int64
All Grades              float64
Total Students          float64
AfriAmerican-Black-%    float64
Primary                 float64
Secondary               float64
High                    float64
District Name_y          object
Tests Taken             float64
% Score 1-2             float64
% Score 3-5             float64
dtype: object
!ls
Hispanic_Latino-Box-Plot(Highvs3-5%).png
Hispanic_Latino-Box-Plot(Primaryvs3-5%).png
Hispanic_Latino-Box-Plot(Secondaryvs3-5%).png
Others-Box-Plot(Highvs3-5%).png
Others-Box-Plot(Primaryvs3-5%).png
Others-Box-Plot(Secondaryvs3-5%).png
import matplotlib.pyplot as plt
import seaborn as sns

# Filter the data and assign binary logic for every different category
merged_df1_i['CS course Binary for Primary'] = merged_df1_i['Primary'].apply(lambda x: 1 if x > 0 else 0)
merged_df1_i['CS course Binary for Secondary'] = merged_df1_i['Secondary'].apply(lambda x: 1 if x > 0 else 0)
merged_df1_i['CS course Binary for High'] = merged_df1_i['High'].apply(lambda x: 1 if x > 0 else 0)

# Filter the data for the two variables
data_cs_course_offered = merged_df1_i[merged_df1_i['CS course Binary for Primary'] == 1]
data_cs_course_not_offered = merged_df1_i[merged_df1_i['CS course Binary for Primary'] == 0]

fig, ax = plt.subplots()

# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']


boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)



# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')

# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')

# Set the title
plt.title('Comparison of Primary Schools and % of Scores 3-5 for African American Group')

plt.savefig('African-American-Box Plot(Primaryvs3-5%).png')

plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Filter the data and assign binary logic for every different category
merged_df1_i['CS course Binary for Primary'] = merged_df1_i['Primary'].apply(lambda x: 1 if x > 0 else 0)
merged_df1_i['CS course Binary for Secondary'] = merged_df1_i['Secondary'].apply(lambda x: 1 if x > 0 else 0)
merged_df1_i['CS course Binary for High'] = merged_df1_i['High'].apply(lambda x: 1 if x > 0 else 0)

# Filter the data for the two variables
data_cs_course_offered = merged_df1_i[merged_df1_i['CS course Binary for Secondary'] == 1]
data_cs_course_not_offered = merged_df1_i[merged_df1_i['CS course Binary for Secondary'] == 0]

fig, ax = plt.subplots()

# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']


boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)



# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')

# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')

# Set the title
plt.title('Comparison of Secondary Schools and % of Scores 3-5 for African American Group')

plt.savefig('African-American-Box Plot(Secondary vs 3-5%).png')

plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Filter the data and assign binary logic for every different category
merged_df1_i['CS course Binary for Primary'] = merged_df1_i['Primary'].apply(lambda x: 1 if x > 0 else 0)
merged_df1_i['CS course Binary for Secondary'] = merged_df1_i['Secondary'].apply(lambda x: 1 if x > 0 else 0)
merged_df1_i['CS course Binary for High'] = merged_df1_i['High'].apply(lambda x: 1 if x > 0 else 0)

# Filter the data for the two variables
data_cs_course_offered = merged_df1_i[merged_df1_i['CS course Binary for High'] == 1]
data_cs_course_not_offered = merged_df1_i[merged_df1_i['CS course Binary for High'] == 0]

fig, ax = plt.subplots()

# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']


boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)



# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')

# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')

# Set the title
plt.title('Comparison of High Schools and % of Scores 3-5 for African American Group')

plt.savefig('African-American-Box Plot(High vs 3-5%).png')

plt.show()
2.Hispanic/Latino
!ls
African-American-Box Plot(High vs 3-5%).png
African-American-Box Plot(Primaryvs3-5%).png
African-American-Box Plot(Secondary vs 3-5%).png
import os
!ls
Afri-Amer-Black.png    Hispanic-Latino.png
Dual-Axis-Box-Plots    Others-Asian-White.png
import matplotlib.pyplot as plt
import seaborn as sns
# Filter the data and assign binary logic for every different category
merged_df3_i['CS course Binary for Primary'] = merged_df3_i['Primary'].apply(lambda x: 1 if x > 0 else 0)
merged_df3_i['CS course Binary for Secondary'] = merged_df3_i['Secondary'].apply(lambda x: 1 if x > 0 else 0)
merged_df3_i['CS course Binary for High'] = merged_df3_i['High'].apply(lambda x: 1 if x > 0 else 0)






# Filter the data for the two variables
data_cs_course_offered = merged_df3_i[merged_df3_i['CS course Binary for Primary'] == 1]
data_cs_course_not_offered = merged_df3_i[merged_df3_i['CS course Binary for Primary'] == 0]
fig, ax = plt.subplots()
# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)
# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)
# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)
ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')
# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')
# Set the title
plt.title('Comparison of Primary Schools and % of Scores 3-5 for Hispanics/Latino Group')

# Save the plot before displaying it
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist
output_file_path = os.path.join(output_directory, 'Hispanic_Latino-Box-Plot(Primaryvs3-5%).png')
plt.savefig(output_file_path)

# Display the plot
plt.show()
data_cs_course_offered = merged_df3_i[merged_df3_i['CS course Binary for Secondary'] == 1]
data_cs_course_not_offered = merged_df3_i[merged_df3_i['CS course Binary for Secondary'] == 0]

fig, ax = plt.subplots()

# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)
# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)
# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)
ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')
# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')


# Set the title
plt.title('Comparison of Secondary Schools and % of Scores 3-5 for Hispanics/Latino Group')

# Save the plot before displaying it
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist
output_file_path = os.path.join(output_directory, 'Hispanic_Latino-Box-Plot(Secondaryvs3-5%).png')
plt.savefig(output_file_path)

# Display the plot
plt.show()
data_cs_course_offered = merged_df3_i[merged_df3_i['CS course Binary for High'] == 1]
data_cs_course_not_offered = merged_df3_i[merged_df3_i['CS course Binary for High'] == 0]

fig, ax = plt.subplots()

# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)
# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)
# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)
ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')
# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')


# Set the title
plt.title('Comparison of High Schools and % of Scores 3-5 for Hispanics/Latino Group')

# Save the plot before displaying it
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist
output_file_path = os.path.join(output_directory, 'Hispanic_Latino-Box-Plot(Highvs3-5%).png')
plt.savefig(output_file_path)

# Display the plot
plt.show()
3. Others Ethinic Group (Asian & White)
merged_df_others_o_ap.dtypes
District Name_x     object
District Code        int64
All Grades         float64
Total Students     float64
Primary            float64
Secondary          float64
High               float64
Others-%           float64
District Name_y     object
Tests Taken        float64
% Score 1-2        float64
% Score 3-5        float64
dtype: object
import matplotlib.pyplot as plt
import seaborn as sns

# Filter the data and assign binary logic for every different category
merged_df_others_o_ap['CS course Binary for Primary'] = merged_df_others_o_ap['Primary'].apply(lambda x: 1 if x > 0 else 0)
merged_df_others_o_ap['CS course Binary for Secondary'] = merged_df_others_o_ap['Secondary'].apply(lambda x: 1 if x > 0 else 0)
merged_df_others_o_ap['CS course Binary for High'] = merged_df_others_o_ap['High'].apply(lambda x: 1 if x > 0 else 0)
import matplotlib.pyplot as plt
import seaborn as sns
# Filter the data for the two variables
data_cs_course_offered = merged_df_others_o_ap[merged_df_others_o_ap['CS course Binary for Primary'] == 1]
data_cs_course_not_offered = merged_df_others_o_ap[merged_df_others_o_ap['CS course Binary for Primary'] == 0]


# Create a figure and axes
fig, ax = plt.subplots()

# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)

# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# Set the x-axis label
ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')

# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')

# Set the title
plt.title('Comparison of Primary School Category and % of Scores 3-5 for Others Ethnic Group (Asian + White)')

# Save the plot before displaying it
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist
output_file_path = os.path.join(output_directory, 'Others-Box-Plot(Primaryvs3-5%).png')
plt.savefig(output_file_path)

# Display the plot
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Filter the data for the two variables
data_cs_course_offered = merged_df_others_o_ap[merged_df_others_o_ap['CS course Binary for Secondary'] == 1]
data_cs_course_not_offered = merged_df_others_o_ap[merged_df_others_o_ap['CS course Binary for Secondary'] == 0]


# Create a figure and axes
fig, ax = plt.subplots()

# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)

# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# Set the x-axis label
ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')

# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')

# Set the title
plt.title('Comparison of Secondary School Category and % of Scores 3-5 for Others Ethnic Group (Asian + White)')

# Save the plot before displaying it
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist
output_file_path = os.path.join(output_directory, 'Others-Box-Plot(Secondaryvs3-5%).png')
plt.savefig(output_file_path)

# Display the plot
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Filter the data for the two variables
data_cs_course_offered = merged_df_others_o_ap[merged_df_others_o_ap['CS course Binary for High'] == 1]
data_cs_course_not_offered = merged_df_others_o_ap[merged_df_others_o_ap['CS course Binary for High'] == 0]


# Create a figure and axes
fig, ax = plt.subplots()

# Create the box plots with different colors for each group
boxplot_data = [data_cs_course_offered['% Score 3-5'], data_cs_course_not_offered['% Score 3-5']]
boxplot_colors = ['blue', 'red']
boxplot = ax.boxplot(boxplot_data, labels=['CS course offered', 'CS course not offered'], patch_artist=True)

# Apply different colors to each box plot
for box, color in zip(boxplot['boxes'], boxplot_colors):
    box.set(facecolor=color)

# Show markers and outliers
for flier in boxplot['fliers']:
    flier.set(marker='o', markersize=5)

# Set the x-axis label
ax.set_xlabel('CS course offered (0: Not Offered, 1: Offered)')

# Set the y-axis label
ax.set_ylabel('Percentage of Passing Scores')

# Set the title
plt.title('Comparison of High School Category and % of Scores 3-5 for Others Ethnic Group (Asian + White)')

# Save the plot before displaying it
output_directory = 'Dual-Axis-Box-Plots'  # Change this to your desired output directory
os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist
output_file_path = os.path.join(output_directory, 'Others-Box-Plot(Highvs3-5%).png')
plt.savefig(output_file_path)

# Display the plot
plt.show()

Pandas profiling comparison report

!ls
District-wise_African_American_CS_Course_Takers_Profiling_Report.html
District-wise_Hispanic_Latino_CS_Course_Takers_Profiling_Report.html
District-wise_Others_CS_Course_Takers_Profiling_Report.html
from ydata_profiling import ProfileReport, compare

comparison_report = compare([profile1, profile2, profile3])

# Save the comparison report to a file
comparison_report.to_file("Comparison-AfrAmer-HispBlack-Others.html")
/Users/priyank/opt/anaconda3/lib/python3.9/site-packages/ydata_profiling/compare_reports.py:173: UserWarning: Comparison of more than two reports is not supported. Reports may be produced, but may yield unexpected formatting.
  warnings.warn(
/Users/priyank/opt/anaconda3/lib/python3.9/site-packages/ydata_profiling/compare_reports.py:195: UserWarning: The datasets being profiled have a different set of columns. Only the left side profile will be calculated.
  warnings.warn(

Misscelenaous (Extra Work)

Extra Plots
import pandas as pd
import matplotlib.pyplot as plt

# Assuming you already have the DataFrame 'merged_df1'

# Extracting the required columns
scatter_data = merged_df1[['Sum_K-3', 'Sum_04-08', 'Sum_09-12', 'Tests Taken']]

# Creating the scatter plot
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_K-3'], label='Sum_K-3', alpha=0.5)
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_04-08'], label='Sum_04-08', alpha=0.5)
plt.scatter(scatter_data['Tests Taken'], scatter_data['Sum_09-12'], label='Sum_09-12', alpha=0.5)

# Adjusting x-axis scale
plt.xlim(0, 25)

# Adding labels, legend, and gridlines
plt.xlabel('Tests Taken')
plt.ylabel('Sum Values')
plt.legend()
plt.grid(True)

# Displaying the plot
plt.show()
merged_df1.head(10)
District Name_x District Code K 01 02 03 04 05 06 07 ... Sum_09-12 District Name_y Tests Taken Score=1 Score=2 Score=3 Score=4 Score=5 % Score 1-2 % Score 3-5
0 Abington 10000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 Abington 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Academy Of the Pacific Rim Charter Public (Dis... 4120000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 14.0 Academy Of the Pacific Rim Charter Public (Dis... 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Advanced Math and Science Academy Charter (Dis... 4300000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 ... 7.0 Advanced Math and Science Academy Charter (Dis... 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 Amherst-Pelham 6050000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 29.0 Amherst-Pelham 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Andover 90000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 9.0 Andover 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 Arlington 100000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 7.0 Arlington 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6 Ashland 140000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 Ashland 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7 Assabet Valley Regional Vocational Technical 8010000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 Assabet Valley Regional Vocational Technical 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 Atlantis Charter (District) 4910000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 4.0 Atlantis Charter (District) 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 Attleboro 160000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 4.0 Attleboro 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

10 rows × 29 columns

merged_df1['Percent-District-Afr-Amer'] = merged_df1['All Grades'] * 100 / merged_df1['Total Students']
 
merged_df1.dtypes
District Name_x               object
District Code                  int64
K                            float64
01                           float64
02                           float64
03                           float64
04                           float64
05                           float64
06                           float64
07                           float64
08                           float64
09                           float64
10                           float64
11                           float64
12                           float64
All Grades                   float64
Total Students               float64
Sum_K-3                      float64
Sum_04-08                    float64
Sum_09-12                    float64
District Name_y               object
Tests Taken                  float64
Score=1                      float64
Score=2                      float64
Score=3                      float64
Score=4                      float64
Score=5                      float64
% Score 1-2                  float64
% Score 3-5                  float64
Percent-District-Afr-Amer    float64
dtype: object
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Selecting the required columns
scatter_data = merged_df1[['District Name_x', 'Percent-District-Afr-Amer', 'Tests Taken', '% Score 3-5', '% Score 1-2']]

# Creating scatter plots
sns.set(style="ticks")
sns.pairplot(scatter_data, hue='District Name_x')

# Displaying the plots
plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Selecting the required columns
scatter_data = merged_df2[['District Name_x', 'Percent-District-Afr-Amer', 'Tests Taken', '% Score 3-5', '% Score 1-2']]

# Creating scatter plots
sns.set(style="ticks")
sns.pairplot(scatter_data, hue='District Name_x')

# Displaying the plots
plt.show()
County Mapping!
district_county_map = {
    "Abby Kelley Foster Charter Public (District)": "Worcester",
    "Abington": "Plymouth",
    "Academy Of the Pacific Rim Charter Public (District)": "Suffolk",
    "Acton-Boxborough": "Middlesex",
    "Acushnet": "Bristol",
    "Advanced Math and Science Academy Charter (District)": "Middlesex",
    "Agawam": "Hampden",
    "Alma del Mar Charter School (District)": "Bristol",
    "Amesbury": "Essex",
    "Amherst": "Hampshire",
    "Amherst-Pelham": "Hampshire",
    "Andover": "Essex",
    "Argosy Collegiate Charter School (District)": "Suffolk",
    "Arlington": "Middlesex",
    "Ashburnham-Westminster": "Worcester",
    "Ashland": "Middlesex",
    "Assabet Valley Regional Vocational Technical": "Middlesex",
    "Athol-Royalston": "Worcester",
    "Atlantis Charter (District)": "Bristol",
    "Attleboro": "Bristol",
    "Auburn": "Worcester",
    "Avon": "Norfolk",
    "Ayer Shirley School District": "Middlesex",
    "Barnstable": "Barnstable",
    "Baystate Academy Charter Public School (District)": "Hampden",
    "Bedford": "Middlesex",
    "Belchertown": "Hampshire",
    "Bellingham": "Norfolk",
    "Belmont": "Middlesex",
    "Benjamin Banneker Charter Public (District)": "Suffolk",
    "Benjamin Franklin Classical Charter Public (District)": "Bristol",
    "Berkley": "Bristol",
    "Berkshire Arts and Technology Charter Public (District)": "Berkshire",
    "Berkshire Hills": "Berkshire",
    "Berlin-Boylston": "Worcester",
    "Beverly": "Essex",
    "Billerica": "Middlesex",
    "Blackstone Valley Regional Vocational Technical": "Worcester",
    "Blackstone-Millville": "Worcester",
    "Blue Hills Regional Vocational Technical": "Norfolk",
    "Boston": "Suffolk",
    "Boston Collegiate Charter (District)": "Suffolk",
    "Boston Day and Evening Academy Charter (District)": "Suffolk",
    "Boston Green Academy Horace Mann Charter School (District)": "Suffolk",
    "Boston Preparatory Charter Public (District)": "SuffolBostonk",
    "Boston Renaissance Charter Public (District)": "Suffolk",
    "Bourne": "Barnstable",
    "Boxford": "Essex",
    "Braintree": "Norfolk",
    "Brewster": "Barnstable",
    "Bridge Boston Charter School (District)": "Suffolk",
    "Bridgewater-Raynham": "Plymouth",
    "Brimfield": "Hampden",
    "Bristol County Agricultural": "Bristol",
    "Bristol-Plymouth Regional Vocational Technical": "Bristol",
    "Brockton": "Plymouth",
    "Brooke Charter School (District)": "Suffolk",
    "Brookfield": "Worcester",
    "Brookline": "Norfolk",
    "Burlington": "Middlesex",
    "Cambridge": "Middlesex",
    "Canton": "Norfolk",
    "Cape Cod Lighthouse Charter (District)": "Barnstable",
    "Cape Cod Regional Vocational Technical": "Barnstable",
    "Carlisle": "Middlesex",
    "Carver": "Plymouth",
    "Central Berkshire": "Berkshire",
    "Chelmsford": "Middlesex",
    "Chelsea": "Suffolk",
    "Chesterfield-Goshen": "Hampshire",
    "Chicopee": "Hampden",
    "Christa McAuliffe Charter Public (District)": "Middlesex",
    "City on a Hill Charter Public School (District)": "Suffolk",
    "Clarksburg": "Berkshire",
    "Clinton": "Worcester",
    "Codman Academy Charter Public (District)": "Suffolk",
    "Cohasset": "Norfolk",
    "Collegiate Charter School of Lowell (District)": "Middlesex",
    "Community Charter School of Cambridge (District)": "Middlesex",
    "Community Day Charter Public School - Gateway (District)": "Essex",
    "Community Day Charter Public School - Prospect (District)": "Essex",
    "Community Day Charter Public School - R. Kingman Webster (District)": "Essex",
    "Concord": "Middlesex",
    "Concord-Carlisle": "Middlesex",
    "Conservatory Lab Charter (District)": "Suffolk",
    "Conway": "Franklin",
    "Danvers": "Essex",
    "Dartmouth": "Bristol",
    "Dedham": "Norfolk",
    "Deerfield": "Franklin",
    "Dennis-Yarmouth": "Barnstable",
    "Dighton-Rehoboth": "Bristol",
    "Douglas": "Worcester",
    "Dover": "Norfolk",
    "Dover-Sherborn": "Norfolk",
    "Dracut": "Middlesex",
    "Dudley Street Neighborhood Charter School (District)": "Suffolk",
    "Dudley-Charlton Reg": "Worcester",
    "Duxbury": "Plymouth",
    "East Bridgewater": "Plymouth",
    "East Longmeadow": "Hampden",
    "Eastham": "Barnstable",
    "Easthampton": "Hampshire",
    "Easton": "Bristol",
    "Edgartown": "Dukes",
    "Edward M. Kennedy Academy for Health Careers (Horace Mann Charter) (District)": "Suffolk",
    "Erving": "Franklin",
    "Essex North Shore Agricultural and Technical School District": "Essex",
    "Everett": "Middlesex",
    "Excel Academy Charter (District)": "Suffolk",
    "Fairhaven": "Bristol",
    "Fall River": "Bristol",
    "Falmouth": "Barnstable",
    "Farmington River Reg": "Berkshire",
    "Fitchburg": "Worcester",
    "Four Rivers Charter Public (District)": "Franklin",
    "Foxborough": "Norfolk",
    "Foxborough Regional Charter (District)": "Norfolk",
    "Framingham": "Middlesex",
    "Francis W. Parker Charter Essential (District)": "Middlesex",
    "Franklin": "Norfolk",
    "Franklin County Regional Vocational Technical": "Franklin",
    "Freetown-Lakeville": "Plymouth",
    "Frontier": "Franklin",
    "Gardner": "Worcester",
    "Gateway": "Hampshire",
    "Georgetown": "Essex",
    "Gill-Montague": "Franklin",
    "Global Learning Charter Public (District)": "Bristol",
    "Gloucester": "Essex",
    "Grafton": "Worcester",
    "Granby": "Hampshire",
    "Greater Commonwealth Virtual District": "Berkshire",
    "Greater Fall River Regional Vocational Technical": "Bristol",
    "Greater Lawrence Regional Vocational Technical": "Essex",
    "Greater Lowell Regional Vocational Technical": "Middlesex",
    "Greater New Bedford Regional Vocational Technical": "Bristol",
    "Greenfield": "Franklin",
    "Groton-Dunstable": "Middlesex",
    "Hadley": "Hampshire",
    "Halifax": "Plymouth",
    "Hamilton-Wenham": "Essex",
    "Hampden Charter School of Science East (District)": "Hampden",
    "Hampden Charter School of Science West (District)": "Hampden",
    "Hampden-Wilbraham": "Hampden",
    "Hampshire": "Hampshire",
    "Hancock": "Berkshire",
    "Hanover": "Plymouth",
    "Harvard": "Worcester",
    "Hatfield": "Hampshire",
    "Haverhill": "Essex",
    "Hawlemont": "Franklin",
    "Helen Y. Davis Leadership Academy Charter Public (District)": "Suffolk",
    "Hill View Montessori Charter Public (District)": "Essex",
    "Hilltown Cooperative Charter Public (District)": "Hampshire",
    "Hingham": "Plymouth",
    "Holbrook": "Norfolk",
    "Holland": "Hampden",
    "Holliston": "Middlesex",
    "Holyoke": "Hampden",
    "Holyoke Community Charter (District)": "Hampden",
    "Hoosac Valley Regional": "Berkshire",
    "Hopedale": "Worcester",
    "Hopkinton": "Middlesex",
    "Hudson": "Middlesex",
    "Hull": "Plymouth",
    "Innovation Academy Charter (District)": "Essex",
    "Ipswich": "Essex",
    "KIPP Academy Boston Charter School (District)": "Suffolk",
    "KIPP Academy Lynn Charter (District)": "Essex",
    "King Philip": "Norfolk",
    "Kingston": "Plymouth",
    "Lawrence": "Essex",
    "Lawrence Family Development Charter (District)": "Essex",
    "Learning First Charter Public School (District)": "Worcester",
    "Lee": "Berkshire",
    "Leicester": "Worcester",
    "Lenox": "Berkshire",
    "Leominster": "Worcester",
    "Leverett": "Franklin",
    "Lexington": "Middlesex",
    "Libertas Academy Charter School (District)": "Middlesex",
    "Lincoln": "Middlesex",
    "Lincoln-Sudbury": "Middlesex",
    "Littleton": "Middlesex",
    "Longmeadow": "Hampden",
    "Lowell": "Middlesex",
    "Lowell Community Charter Public (District)": "Middlesex",
    "Lowell Middlesex Academy Charter (District)": "Middlesex",
    "Ludlow": "Hampden",
    "Lunenburg": "Worcester",
    "Lynn": "Essex",
    "Lynnfield": "Essex",
    "Malden": "Middlesex",
    "Manchester Essex Regional": "Essex",
    "Mansfield": "Bristol",
    "Marblehead": "Essex",
    "Marion": "Plymouth",
    "Marlborough": "Middlesex",
    "Marshfield": "Plymouth",
    "Martha's Vineyard": "Dukes",
    "Martin Luther King Jr. Charter School of Excellence (District)": "Suffolk",
    "Masconomet": "Essex",
    "Mashpee": "Barnstable",
    "Mass Academy of Math and Science at WPI (District)": "Worcester",
    "Mat-Su Borough": "Anchorage",
    "Matter and Form Charter School (District)": "Suffolk",
    "Mattapoisett": "Plymouth",
    "Maynard": "Middlesex",
    "Medfield": "Norfolk",
    "Medford": "Middlesex",
    "Medway": "Norfolk",
    "Melrose": "Middlesex",
    "Memorial School District": "Bristol",
    "Mendon-Upton": "Worcester",
    "Methuen": "Essex",
    "Middleborough": "Plymouth",
    "Middleton": "Essex",
    "Milford": "Worcester",
    "Millis": "Norfolk",
    "Milton": "Norfolk",
    "Mohawk Trail": "Franklin",
    "Monomoy Regional": "Barnstable",
    "Monson": "Hampden",
    "Montachusett Regional Vocational Technical": "Worcester",
    "Montague": "Franklin",
    "Monterey": "Berkshire",
    "Mount Greylock": "Berkshire",
    "Nahant": "Essex",
    "Nantucket": "Nantucket",
    "Narragansett": "Worcester",
    "Nashoba": "Middlesex",
    "Nashoba Valley Technical": "Middlesex",
    "Nashua River Valley Vocational Technical": "Middlesex",
    "Natick": "Middlesex",
    "Nauset": "Barnstable",
    "Needham": "Norfolk",
    "New Bedford": "Bristol",
    "New England Academy Charter School (District)": "Middlesex",
    "New Heights Charter School of Brockton (District)": "Plymouth",
    "New Salem-Wendell": "Franklin",
    "Newburyport": "Essex",
    "Newton": "Middlesex",
    "North Adams": "Berkshire",
    "North Andover": "Essex",
    "North Attleborough": "Bristol",
    "North Brookfield": "Worcester",
    "North Middlesex": "Middlesex",
    "North Reading": "Middlesex",
    "Northampton": "Hampshire",
    "Northborough": "Worcester",
    "Northbridge": "Worcester",
    "Norton": "Bristol",
    "Norwell": "Plymouth",
    "Norwood": "Norfolk",
    "Old Colony Regional Vocational Technical": "Bristol",
    "Old Rochester": "Plymouth",
    "Orange": "Franklin",
    "Orleans": "Barnstable",
    "Oxford": "Worcester",
    "Palmer": "Hampden",
    "Pathfinder Regional Vocational Technical": "Worcester",
    "Pembroke": "Plymouth",
    "Pentucket": "Essex",
    "Petersham": "Worcester",
    "Pittsfield": "Berkshire",
    "Plainville": "Norfolk",
    "Plymouth": "Plymouth",
    "Plympton": "Plymouth",
    "Provincetown": "Barnstable",
    "Quaboag Regional": "Worcester",
    "Quabbin": "Worcester",
    "Quincy": "Norfolk",
    "Ralph C. Mahar": "Franklin",
    "Randolph": "Norfolk",
    "Reading": "Middlesex",
    "Revere": "Suffolk",
    "Richmond": "Berkshire",
    "Rising Tide Charter Public (District)": "Plymouth",
    "Rochester": "Plymouth",
    "Rockland": "Plymouth",
    "Rockport": "Essex",
    "Rowe": "Franklin",
    "Royalston": "Worcester",
    "Russell": "Hampden",
    "Rutland": "Worcester",
    "Salem": "Essex",
    "Salem Academy Charter (District)": "Essex",
    "Salisbury": "Essex",
    "Sandwich": "Barnstable",
    "Saugus": "Essex",
    "Savoy": "Berkshire",
    "Scituate": "Plymouth",
    "Seekonk": "Bristol",
    "Sharon": "Norfolk",
    "Shawsheen Valley Regional Vocational Technical": "Middlesex",
    "Sherborn": "Middlesex",
    "Shirley": "Middlesex",
    "Shrewsbury": "Worcester",
    "Shutesbury": "Franklin",
    "Silver Lake": "Plymouth",
    "Somerset": "Bristol",
    "Somerville": "Middlesex",
    "South Hadley": "Hampshire",
    "South Middlesex Regional Vocational Technical": "Middlesex",
    "South Shore Charter Public (District)": "Plymouth",
    "South Shore Regional Vocational Technical": "Plymouth",
    "Southborough": "Worcester",
    "Southbridge": "Worcester",
    "Southwick-Tolland-Granville Regional": "Hampden",
    "Spencer-East Brookfield": "Worcester",
    "Springfield": "Hampden",
    "Springfield Conservatory of the Arts (District)": "Hampden",
    "Stoneham": "Middlesex",
    "Stoughton": "Norfolk",
    "Sturbridge": "Worcester",
    "Sudbury": "Middlesex",
    "Sunderland": "Franklin",
    "Sutton": "Worcester",
    "Swampscott": "Essex",
    "Swansea": "Bristol",
    "Tantasqua": "Worcester",
    "Tecumseh": "Lenawee",
    "Tewksbury": "Middlesex",
    "Tisbury": "Dukes",
    "Topsfield": "Essex",
    "Tri-County Regional Vocational Technical": "Norfolk",
    "Tyngsborough": "Middlesex",
    "Uxbridge": "Worcester",
    "Wachusett": "Worcester",
    "Wales": "Hampden",
    "Walpole": "Norfolk",
    "Waltham": "Middlesex",
    "Ware": "Hampshire",
    "Wareham": "Plymouth",
    "Warren": "Worcester",
    "Warwick": "Franklin",
    "Washington": "Berkshire",
    "Watertown": "Middlesex",
    "Wayland": "Middlesex",
    "Webster": "Worcester",
    "Wellesley": "Norfolk",
    "Wellfleet": "Barnstable",
    "West Boylston": "Worcester",
    "West Bridgewater": "Plymouth",
    "West Brookfield": "Worcester",
    "West Springfield": "Hampden",
    "West Stockbridge": "Berkshire",
    "West Tisbury": "Dukes",
    "Westborough": "Worcester",
    "Westfield": "Hampden",
    "Westford": "Middlesex",
    "Westhampton": "Hampshire",
    "Westminster": "Worcester",
    "Weston": "Middlesex",
    "Westport": "Bristol",
    "Westwood": "Norfolk",
    "Weymouth": "Norfolk",
    "Whately": "Franklin",
    "Whitman-Hanson": "Plymouth",
    "Wilbraham": "Hampden",
    "Williamsburg": "Hampshire",
    "Williamstown": "Berkshire",
    "Wilmington": "Middlesex",
    "Winchendon": "Worcester",
    "Winchester": "Middlesex",
    "Windsor": "Berkshire",
    "Winthrop": "Suffolk",
    "Woburn": "Middlesex",
    "Worcester": "Worcester",
    "Worthington": "Hampshire",
    "Wrentham": "Norfolk",
    "Yarmouth": "Barnstable",
    "Florida": "Berkshire",
    "MATCH Charter Public School (District)": "Suffolk",
    "Ma Academy for Math and Science": "Worcester",
    "Map Academy Charter School (District)": "Plymouth",
    "Marblehead Community Charter Public (District)": "Essex",
    "Martha's Vineyard Charter (District)": "Dukes",
    "Millbury": "Worcester",
    "Minuteman Regional Vocational Technical": "Middlesex",
    "Monomoy Regional School District": "Barnstable",
    "Mystic Valley Regional Charter (District)": "Middlesex",
    "Nashoba Valley Regional Vocational Technical": "Middlesex",
    "Neighborhood House Charter (District)": "Suffolk",
    "Norfolk": "Norfolk",
    "Norfolk County Agricultural": "Norfolk",
    "Northampton-Smith Vocational Agricultural": "Hampshire",
    "Northboro-Southboro": "Worcester",
    "Northeast Metropolitan Regional Vocational Technical": "Middlesex",
    "Northern Berkshire Regional Vocational Technical": "Berkshire",
    "Oak Bluffs": "Dukes",
    "Old Sturbridge Academy Charter Public School (District)": "Worcester",
    "Peabody": "Essex",
    "Pelham": "Hampshire",
    "Phoenix Academy Public Charter High School Lawrence (District)": "Essex",
    "Phoenix Academy Public Charter High School Springfield (District)": "Hampden",
    "Phoenix Charter Academy (District)": "Suffolk",
    "Pioneer Charter School of Science (District)": "Middlesex",
    "Pioneer Charter School of Science II (PCSS-II) (District)": "Essex",
    "Pioneer Valley": "Franklin",
    "Pioneer Valley Chinese Immersion Charter (District)": "Hampshire",
    "Pioneer Valley Performing Arts Charter Public (District)": "Hampshire",
    "Prospect Hill Academy Charter (District)": "Middlesex",
    "Ralph C Mahar": "Franklin",
    "River Valley Charter (District)": "Essex",
    "Roxbury Preparatory Charter (District)": "Suffolk",
    "Sizer School: A North Central Charter Essential (District)": "Worcester",
    "Somerset Berkley Regional School District": "Bristol",
    "Southampton": "Hampshire",
    "Southeastern Regional Vocational Technical": "Bristol",
    "Southern Berkshire": "Berkshire",
    "Southern Worcester County Regional Vocational School District": "Worcester",
    "Southwick-Tolland-Granville Regional School District": "Hampden",
    "Spencer-E Brookfield": "Worcester",
    "Springfield International Charter (District)": "Hampden",
    "Springfield Preparatory Charter School (District)": "Hampden",
    "Sturgis Charter Public (District)": "Barnstable",
    "TEC Connections Academy Commonwealth Virtual School District": "Norfolk",
    "Taunton": "Bristol",
    "Triton": "Essex",
    "Truro": "Barnstable",
    "UP Academy Charter School of Boston (District)": "Suffolk",
    "UP Academy Charter School of Dorchester (District)": "Suffolk",
    "Up-Island Regional": "Dukes",
    "Upper Cape Cod Regional Vocational Technical": "Barnstable",
    "Veritas Preparatory Charter School (District)": "Hampden",
    "Wakefield": "Middlesex",
    "Whittier Regional Vocational Technical": "Essex"
  }
County Work!
merged_df1['County'] = merged_df1['District Name_x'].map(district_county_map)
merged_df2['County'] = merged_df2['District Name_x'].map(district_county_map)
merged_df1
District Name_x District Code K 01 02 03 04 05 06 07 ... Tests Taken Score=1 Score=2 Score=3 Score=4 Score=5 % Score 1-2 % Score 3-5 Percent-District-Afr-Amer County
0 Abington 10000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.900901 Plymouth
1 Academy Of the Pacific Rim Charter Public (Dis... 4120000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.560261 Suffolk
2 Advanced Math and Science Academy Charter (Dis... 4300000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 ... 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 47.619048 Middlesex
3 Amherst-Pelham 6050000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 26.851852 Hampshire
4 Andover 90000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6.474820 Essex
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
155 Whitman-Hanson 7800000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.411765 Plymouth
156 Whittier Regional Vocational Technical 8850000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.000000 Essex
157 Woburn 3470000 0.0 0.0 0.0 0.0 0.0 0.0 14.0 15.0 ... 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.687500 Middlesex
158 Worcester 3480000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 72.0 41.0 10.0 11.0 8.0 2.0 70.8 29.2 1.796407 Worcester
159 State Totals 0 0.0 0.0 10.0 23.0 10.0 21.0 85.0 232.0 ... 1140.0 559.0 246.0 191.0 92.0 52.0 70.6 29.4 3.427531 NaN

160 rows × 31 columns

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Selecting the required columns
scatter_data = merged_df1[['County', 'Percent-District-Afr-Amer', 'Tests Taken', '% Score 3-5', '% Score 1-2']]

# Creating scatter plots
sns.set(style="ticks")
sns.pairplot(scatter_data, hue='County')

# Displaying the plots
plt.show()
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Selecting the required columns
scatter_data = merged_df2[['County', 'Percent-District-Afr-Amer', 'Tests Taken', '% Score 3-5', '% Score 1-2']]

# Creating scatter plots
sns.set(style="ticks")
sns.pairplot(scatter_data, hue='County')

# Displaying the plots
plt.show()
# df1.isnull().sum() # column wise null check

# df1['All Grades'] = df1['All Grades'].str.replace(',', '').astype('float64')
# df1['Total Students'] = df1['Total Students'].str.replace(',', '').astype('float64')
# merged_df1_i = merged_df1_i.drop(['CS course Binary for Sum_K-3', 'CS course Binary for Sum_04-08', 'CS course Binary for Sum_09-12'], axis=1)