This data notebook contains the analysis that generated facts in the story “Federal Title IX data on sports participation is unreliable” from the series “Unlevel Playing Fields”. For each sentence in the story generated by original data analysis, I have provided the corresponding code and results.
## Libraries
# For general data science
library(tidyverse)
# For data cleaning
library(janitor)
# For loading Excel files
library(readxl)
# For working with datetime
library(lubridate)
# For U.S. Census Bureau data
library(tigris)
# Avoid use of scientific notation
options(scipen = 999)
# For pretty tables
library(kableExtra)
library(knitr)
## Functions
# Shared helper for the two district loaders below.
# Takes the per-sport participation rows for one district and returns one row
# per county/year with:
#   boys_total_participation / girls_total_participation:
#     sums over rows whose `type` is NA (i.e. excluding corollary sports)
#   boys_total_participation_cs / girls_total_participation_cs:
#     sums over every row (i.e. including corollary sports)
# `lea` is stored as the first column, `leaid`, so the result can be joined
# with federal data.
summarise_district_participation <- function(district_athletics, lea) {
  # Totals without corollary sports: keep only rows with no `type` value
  sums_without_cs <- district_athletics %>%
    filter(is.na(type)) %>%
    group_by(county, year) %>%
    summarise(
      boys_total_participation = sum(boys_participants),
      girls_total_participation = sum(girls_participants),
      # Drop the grouping so callers do not inherit a grouped data frame
      .groups = "drop"
    )
  # Totals across every row, corollary sports included
  sums_all <- district_athletics %>%
    group_by(county, year) %>%
    summarise(
      boys_total_participation_cs = sum(boys_participants),
      girls_total_participation_cs = sum(girls_participants),
      .groups = "drop"
    )
  # Join keys are spelled out rather than left to a natural join
  sums_without_cs %>%
    left_join(sums_all, by = c("county", "year")) %>%
    mutate(leaid = lea, .before = 1)
}
# Function for loading and transforming districts with 2017_18 data only
# `county` is the file-name prefix of the district's csv; `lea` is the
# district id that is written to the `leaid` column.
districts <- function(county, lea) {
  path <- paste0("../data/processed/md/", county, "_2017_18.csv")
  district_athletics <- read_csv(path) %>%
    # Treat missing numeric values as zero so they do not turn the sums into NA
    mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0)))
  summarise_district_participation(district_athletics, lea)
}
# Function for loading and transforming districts with all years
# Same as districts(), but reads the "_all_years" file and keeps only the
# rows for the 2017-18 school year.
districts_all_years <- function(county, lea) {
  path <- paste0("../data/processed/md/", county, "_all_years.csv")
  district_athletics <- read_csv(path) %>%
    filter(year == "2017-18") %>%
    # Treat missing numeric values as zero so they do not turn the sums into NA
    mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0)))
  summarise_district_participation(district_athletics, lea)
}
# Function to calculate the proportion of athletes and enrollment that boys and girls represent in multiple datasets. Also calculates the gender gap using those proportions.
#
# Arguments:
#   df: data frame holding the enrollment and participation counts
#   male_enrollment, female_enrollment, total_enrollment,
#   boys_sports, girls_sports, total_sports: names of columns in `df`,
#     each given as a single string
#   rounding_digits: number of decimal places passed to round()
# Returns `df` with these columns added: boys_enroll_percent,
# girls_enroll_percent, boys_sports_percent, girls_sports_percent, gender_gap.
#
# The result is returned directly. Earlier versions also wrote it to a global
# variable `x` with `<<-`, silently overwriting any existing `x`; that side
# effect has been removed.
gender_gap <- function(df, male_enrollment, female_enrollment, total_enrollment, boys_sports, girls_sports, total_sports, rounding_digits) {
  df %>%
    # Calculate percentage of student body that is male and percentage that is female
    mutate(
      boys_enroll_percent = round(.data[[male_enrollment]] / .data[[total_enrollment]] * 100, rounding_digits),
      girls_enroll_percent = round(.data[[female_enrollment]] / .data[[total_enrollment]] * 100, rounding_digits)
    ) %>%
    # Calculate percentage of athletes that are male and percentage that are female
    mutate(
      boys_sports_percent = round(.data[[boys_sports]] / .data[[total_sports]] * 100, rounding_digits),
      girls_sports_percent = round(.data[[girls_sports]] / .data[[total_sports]] * 100, rounding_digits)
    ) %>%
    # Subtract enrollment percentage from athlete percentage to find the gender gap
    mutate(gender_gap = boys_sports_percent - boys_enroll_percent)
}
# Function for formatted table output
# `table` is the object handed to kable(); `text` is used as the caption.
# The table is styled with kableExtra and wrapped in a full-width scroll box.
output_formatted_table <- function(table, text) {
  table %>%
    kable(caption = text) %>%
    # TRUE is spelled out: `T` is an ordinary variable that can be reassigned
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), font_size = 14, fixed_thead = TRUE) %>%
    scroll_box(width = "100%")
}
Since 2000, the U.S. Department of Education’s Office for Civil Rights has required K-12 schools to submit data on various programs, including athletics, through the Civil Rights Data Collection. The survey typically occurs every two years; however, the cycle was disrupted by the COVID-19 pandemic. At the time of reporting, the latest available CRDC data was from the 2017-18 school year.
The specific CRDC elements I used were the single-sex athletics questions and the enrollment variables.
## CRDC athletic participation data
# Load the 2017-18 CRDC athletics file, turn the -9 sentinel into NA,
# standardise the column names and correct a misspelled district name.
crdc_2017_18 <- read.csv("../data/source/civil_rights_data_collection_OCR/2017-18-crdc-data-corrected-publication-2/data/crdc_athletics_2017_18.csv") %>%
  # The single-sex athletics indicator holds text (it is compared with "Yes"
  # further down), so its sentinel has to be matched as the string "-9".
  # na_if() in dplyr >= 1.1 raises an error when the two types differ.
  mutate(SCH_SSATHLETICS_IND = na_if(as.character(SCH_SSATHLETICS_IND), "-9")) %>%
  # Numeric columns are compared with the number -9 for the same reason
  mutate(across(where(is.numeric), ~ na_if(.x, -9))) %>%
  clean_names() %>%
  # Correct the misspelling of Washington County in the source data
  mutate(lea_name = case_when(
    lea_name == "Washingtion County Public Schools" ~ "Washington County Public Schools",
    TRUE ~ lea_name
  ))
## CRDC enrollment data
# Read the 2017-18 CRDC enrollment file, standardise the column names, keep
# the Maryland rows and retain only the school key and the male and female
# enrollment totals.
md_crdc_enroll <- read_csv(
  "../data/source/civil_rights_data_collection_OCR/2017-18-crdc-data-corrected-publication-2/data/crdc_enrollment_2017_18.csv"
) %>%
  clean_names() %>%
  filter(lea_state == "MD") %>%
  select(
    combokey,
    tot_enr_m,
    tot_enr_f
  )
## Join CRDC athletic participation and enrollment data
# Keep only Maryland schools that report single-sex athletics. Dropping the others removes elementary and middle schools, juvenile justice facilities and some specialized schools.
md_schools_crdc <- crdc_2017_18 %>%
  filter(lea_state == "MD", sch_ssathletics_ind == "Yes") %>%
  # The combokeys for Maryland schools are not correct in this file. In other CRDC datasets the combokey is the leaid followed by the 5-digit school id, so rebuild it that way to get a key that matches the enrollment data.
  mutate(
    combokey_correct = paste0(
      leaid,
      str_pad(schid, width = 5, side = "left", pad = "0")
    )
  ) %>%
  # A left join keeps every athletics school selected above and attaches its enrollment; enrollment rows for other schools are not carried over.
  left_join(md_crdc_enroll, by = c("combokey_correct" = "combokey")) %>%
  # Total enrollment is the sum of male and female enrollment
  mutate(tot_enr = tot_enr_m + tot_enr_f)
## Calculate districtwide sums from Maryland schools in CRDC
# Collapse the school-level CRDC rows to one row per district (leaid), adding up boys and girls participation and male, female and total enrollment across the district's schools.
# NOTE(review): the sums are taken without na.rm, so a single NA in a school row makes that district's total NA. Confirm this is intended.
md_crdc <- md_schools_crdc %>%
  group_by(leaid, lea_state, lea_name) %>%
  summarise(
    boys_sum = sum(sch_sspart_m),
    girls_sum = sum(sch_sspart_f),
    enr_m = sum(tot_enr_m),
    enr_f = sum(tot_enr_f),
    enr_total = sum(tot_enr)
  )
To check if federal data reflected reality, my colleagues and I filed public records requests for athletics participation by gender at each of Maryland’s 24 public school districts. We received usable data from 20 districts. For most districts, we received five years of data, but I focused my analysis on 2017-18 because that was the latest available for federal data.
Baltimore City Public Schools provided data that came from the Office for Civil Rights, so it could not be compared with the type of data we received from other districts. Prince George’s County Public Schools, Queen Anne’s County Public Schools and Talbot County Public Schools did not fulfill our public records requests.
In examining Somerset County Public Schools’ data, I found that the district had recorded 2017-18 cross country boys participation as 116 when it should have been 16. I corrected that within the district data, but the athletic director could not confirm if that error was carried over to the district’s federal data reporting. So the CRDC data for Somerset has been left unchanged.
Most districts provided the athletics data from annual forms they submit to the state athletics association. My colleagues and I extracted the data from those pdfs into csvs. Then I wrote the districts functions (above) to transform the data to a structure that could be compared with federal data.
## Run functions to load and transform athletics data from each district
# Each call takes the district's file-name prefix and its leaid. Kent is the
# only district loaded with districts(), which reads a 2017-18-only file; the
# others are loaded with districts_all_years().
allegany <- districts_all_years("allegany", "2400030")
anne_arundel <- districts_all_years("anne_arundel", "2400060")
baltimore_county <- districts_all_years("baltimore_county", "2400120")
calvert <- districts_all_years("calvert", "2400150")
carroll <- districts_all_years("carroll", "2400210")
cecil <- districts_all_years("cecil", "2400240")
charles <- districts_all_years("charles", "2400270")
dorchester <- districts_all_years("dorchester", "2400300")
garrett <- districts_all_years("garrett", "2400360")
harford <- districts_all_years("harford", "2400390")
howard <- districts_all_years("howard", "2400420")
kent <- districts("kent", "2400450")
somerset <- districts_all_years("somerset", "2400570")
st_marys <- districts_all_years("st_marys", "2400600")
washington <- districts_all_years("washington", "2400660")
wicomico <- districts_all_years("wicomico", "2400690")
worcester <- districts_all_years("worcester", "2400720")
A few districts provided more detailed in-house data, which required customized transformation scripts.
## Caroline County --------------------
# Caroline's file has separate participation columns for Colonel Richardson
# and North Caroline. Sum each school's columns for 2017-18, then add the two
# schools together to get the district totals.
caroline <- read_csv("../data/processed/md/caroline_all_years.csv") %>%
  filter(year == "2017-18") %>%
  # Missing numeric values count as zero
  mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0))) %>%
  group_by(county, year) %>%
  summarise(
    boys_colrich = sum(boys_participants_colonel_richardson),
    girls_colrich = sum(girls_participants_colonel_richardson),
    boys_northcar = sum(boys_participants_north_caroline),
    girls_northcar = sum(girls_participants_north_caroline)
  ) %>%
  # Add totals from each school. The corollary sports columns are set to 0 because we don't have that data from this district.
  # NOTE(review): for districts loaded with the districts functions, the *_cs columns hold totals that INCLUDE corollary sports. Confirm that a literal 0 (rather than NA) is what the later *_cs comparisons should see for this district.
  mutate(
    boys_total_participation = boys_colrich + boys_northcar,
    girls_total_participation = girls_colrich + girls_northcar,
    leaid = "2400180",
    boys_total_participation_cs = 0,
    girls_total_participation_cs = 0
  ) %>%
  select(
    leaid, county, year,
    boys_total_participation, girls_total_participation,
    boys_total_participation_cs, girls_total_participation_cs
  )
## Frederick County --------------------
# Frederick's file has one row per school with male and female athlete counts.
# Only the first ten rows are used.
# NOTE(review): presumably the rows after the tenth are totals or notes; confirm against the csv.
frederick <- read_csv("../data/processed/md/frederick_2017_18.csv") %>%
  # The first column is renamed to `school`
  rename(school = 1) %>%
  slice_head(n = 10) %>%
  # Add the identifying columns used by the other district data frames
  mutate(
    leaid = "2400330",
    county = "Frederick",
    year = "2017-18",
    .before = 1
  ) %>%
  group_by(leaid, county, year) %>%
  summarise(
    boys_total_participation = sum(male_athletes),
    girls_total_participation = sum(female_athletes)
  ) %>%
  # Set corollary sports equal to 0 because we don't have that data from this district
  mutate(
    boys_total_participation_cs = 0,
    girls_total_participation_cs = 0
  )
## Montgomery County --------------------
# Read in the key that the school provided to match school names, numbers and abbreviations.
# The result has four columns: school_number, school_abbreviation, school_long and school_crdc.
hs_key <- read_excel(
  "../data/source/maryland_school_districts/montgomery_county_ps/fy22_mcps_high_school_name_key.xlsx",
  col_names = c("alpha", "school_number", "school_abbreviation", "school_short", "school_long", "school"),
  skip = 1
) %>%
  select(school_number, school_abbreviation, school_long) %>%
  mutate(
    # School numbers are kept as text
    school_number = as.character(school_number),
    # Replace Bethesda Chevy Chase abbreviation with the abbreviation that is in the school's participation data
    school_abbreviation = case_when(
      school_abbreviation == "BC" ~ "BCC",
      TRUE ~ school_abbreviation
    ),
    # Add "High" to the end of the school names to match CRDC data
    school_crdc = paste(school_long, "High", sep = " "),
    # Northwood is listed as "Northwood High School" in the Civil Rights Data Collection data, so it gets the extra word
    school_crdc = case_when(
      school_crdc == "Northwood High" ~ "Northwood High School",
      TRUE ~ school_crdc
    )
  )
# I found that the district's year-long totals had errors in some calculations, making it necessary to read in each season's data and calculate totals
# Load fall 2017 data (sheet 1). Column names are supplied by hand and the first three rows of the sheet are skipped.
mcps_athletics_fall_2017_18 <- read_excel(
  "../data/source/maryland_school_districts/montgomery_county_ps/mcps_athletics_participation_2017_18.xlsx",
  sheet = 1,
  col_names = c(
    "school", "cc_v_boys", "cc_v_girls", "cc_v_tot",
    "fh_v_girls", "fh_jv_girls", "fh_tot_girls",
    "fb_v_boys", "fb_jv_boys", "fb_tot_boys",
    "golf_v_boys", "golf_v_girls", "golf_v_tot",
    "soc_v_boys", "soc_jv_boys", "soc_tot_boys",
    "soc_v_girls", "soc_jv_girls", "soc_tot_girls",
    "tennis_v_girls",
    "vb_v_girls", "vb_jv_girls", "vb_tot_girls",
    "hand_v_boys", "hand_v_girls", "hand_v_tot",
    "varsity_total_boys", "varsity_total_girls", "varsity_total",
    "jv_total_boys", "jv_total_girls", "jv_total",
    "total_participation_boys", "total_participation_girls", "total_participation",
    "cheer_v_boys", "cheer_jv_boys", "cheer_v_girls", "cheer_jv_girls", "cheer_total",
    "poms_v_girls", "cheer_poms_tot_boys", "cheer_poms_tot_girls", "cheer_pom_total",
    "grand_total_all"
  ),
  skip = 3
) %>%
  # Drop the summary rows and any row without a school label
  filter(!is.na(school), !school %in% c("TOTAL", "MOST", "LEAST", "AVG/25")) %>%
  # cc_v_boys is converted to numeric so it can be added to the other counts
  mutate(cc_v_boys = as.numeric(cc_v_boys))
# Calculate fall 2017 total participation by gender
mcps_fall_totals <- mcps_athletics_fall_2017_18 %>%
  mutate(
    # Boys total without handball, which is a corollary sport, and excluding cheer and poms (?)
    boys_fall_tot_part = cc_v_boys + fb_v_boys + fb_jv_boys + golf_v_boys + soc_v_boys + soc_jv_boys,
    # Girls total without handball, which is a corollary sport, and excluding cheer and poms (?)
    girls_fall_tot_part = cc_v_girls + fh_v_girls + fh_jv_girls + golf_v_girls + soc_v_girls + soc_jv_girls + tennis_v_girls + vb_v_girls + vb_jv_girls,
    # Boys total with corollary sports (handball)
    boys_fall_tot_part_cs = boys_fall_tot_part + hand_v_boys,
    # Girls total with corollary sports (handball)
    girls_fall_tot_part_cs = girls_fall_tot_part + hand_v_girls
  )
# Load winter 2017-18 participation data (sheet 2). Column names are supplied by hand and the first three rows of the sheet are skipped.
mcps_athletics_winter_2017_18 <- read_excel(
  "../data/source/maryland_school_districts/montgomery_county_ps/mcps_athletics_participation_2017_18.xlsx",
  sheet = 2,
  col_names = c(
    "school",
    "bball_v_boys", "bball_jv_boys", "bball_tot_boys",
    "bball_v_girls", "bball_jv_girls", "bball_tot_girls",
    "intrk_v_boys", "intrk_v_girls", "intrk_v_tot",
    "swim_v_boys", "swim_v_girls", "swim_v_tot",
    "wrestlg_v_boys", "wrestlg_jv_boys", "wrestlg_tot_girls", "wrestlg_tot",
    "bocce_v_boys", "bocce_v_girls", "bocce_tot",
    "varsity_total_boys", "varsity_total_girls", "varsity_total",
    "jv_total_boys", "jv_total_girls", "jv_total",
    "total_participation_boys", "total_participation_girls", "total_participation",
    "cheer_v_boys", "cheer_v_girls", "cheer_jv_girls", "cheer_total",
    "poms_v_girls", "cheer_poms_tot_boys", "cheer_poms_tot_girls", "cheer_pom_total",
    "grand_total_all"
  ),
  skip = 3
) %>%
  # Drop the summary rows, rows labelled "NA" and any row without a school label
  filter(!is.na(school), !school %in% c("TOTAL", "MOST", "LEAST", "AVG/25", "NA")) %>%
  # bball_jv_boys is converted to numeric so it can be added to the other counts
  mutate(bball_jv_boys = as.numeric(bball_jv_boys))
# Calculate winter total participation by gender
mcps_winter_totals <- mcps_athletics_winter_2017_18 %>%
  mutate(
    # Boys total without bocce, which is a corollary sport, and excluding cheer and poms (?)
    boys_winter_tot_part = bball_v_boys + bball_jv_boys + intrk_v_boys + swim_v_boys + wrestlg_v_boys + wrestlg_jv_boys,
    # Girls total without bocce, which is a corollary sport, and excluding cheer and poms (?)
    girls_winter_tot_part = bball_v_girls + bball_jv_girls + intrk_v_girls + swim_v_girls + wrestlg_tot_girls,
    # Boys total with corollary sports (bocce)
    boys_winter_tot_part_cs = boys_winter_tot_part + bocce_v_boys,
    # Girls total with corollary sports (bocce)
    girls_winter_tot_part_cs = girls_winter_tot_part + bocce_v_girls
  )
# Load spring 2018 participation data (sheet 3). Column names are supplied by hand and the first three rows of the sheet are skipped.
mcps_athletics_spring_2017_18 <- read_excel(
  "../data/source/maryland_school_districts/montgomery_county_ps/mcps_athletics_participation_2017_18.xlsx",
  sheet = 3,
  col_names = c(
    "school",
    "base_v_boys", "base_jv_boys", "base_tot_boys",
    "gym_v_girls",
    "lax_v_boys", "lax_jv_boys", "lax_tot_boys",
    "lax_v_girls", "lax_jv_girls", "lax_tot_girls",
    "soft_v_girls", "soft_jv_girls", "soft_tot_girls",
    "tennis_v_boys",
    "trk_v_boys", "trk_v_girls", "trk_tot",
    "vball_v_boys", "vball_co_v_boys", "vball_co_girls", "vball_co_tot",
    "allied_soft_v_boys", "allied_soft_v_girls", "allied_soft_tot",
    "varsity_total_boys", "varsity_total_girls", "varsity_total",
    "jv_total_boys", "jv_total_girls", "jv_total",
    "total_participation_boys", "total_participation_girls", "total_participation"
  ),
  skip = 3
) %>%
  # Drop the summary rows, rows labelled "NA" and any row without a school label
  filter(!is.na(school), !school %in% c("TOTAL", "MOST", "LEAST", "AVG/25", "NA")) %>%
  # base_v_boys is converted to numeric so it can be added to the other counts
  mutate(base_v_boys = as.numeric(base_v_boys))
# Calculate spring total participation by gender
mcps_spring_totals <- mcps_athletics_spring_2017_18 %>%
  mutate(
    # Boys total without allied softball, which is a corollary sport
    boys_spring_tot_part = base_v_boys + base_jv_boys + lax_v_boys + lax_jv_boys + tennis_v_boys + trk_v_boys + vball_v_boys + vball_co_v_boys,
    # Girls total without allied softball, which is a corollary sport
    girls_spring_tot_part = gym_v_girls + lax_v_girls + lax_jv_girls + soft_v_girls + soft_jv_girls + trk_v_girls + vball_co_girls,
    # Boys total with corollary sports (allied softball)
    boys_spring_tot_part_cs = boys_spring_tot_part + allied_soft_v_boys,
    # Girls total with corollary sports (allied softball)
    girls_spring_tot_part_cs = girls_spring_tot_part + allied_soft_v_girls
  )
# Create dataframes that include only the columns needed for calculating the year-long totals
# Fall
mcps_fall_totals_clean <- mcps_fall_totals %>%
  select(school, boys_fall_tot_part, girls_fall_tot_part, boys_fall_tot_part_cs, girls_fall_tot_part_cs)
# Winter
mcps_winter_totals_clean <- mcps_winter_totals %>%
  select(school, boys_winter_tot_part, girls_winter_tot_part, boys_winter_tot_part_cs, girls_winter_tot_part_cs)
# Spring
mcps_spring_totals_clean <- mcps_spring_totals %>%
  select(school, boys_spring_tot_part, girls_spring_tot_part, boys_spring_tot_part_cs, girls_spring_tot_part_cs)
# Join the fall, winter, spring dataframes. `school` is the only column the three share, so it is named as the join key explicitly. An inner join keeps only schools present in all three seasons.
mcps_totals_2017_18 <- mcps_fall_totals_clean %>%
  inner_join(mcps_winter_totals_clean, by = "school") %>%
  inner_join(mcps_spring_totals_clean, by = "school") %>%
  mutate(
    # Boys year-long total without corollary sports
    boys_total_participation = boys_fall_tot_part + boys_winter_tot_part + boys_spring_tot_part,
    # Girls year-long total without corollary sports
    girls_total_participation = girls_fall_tot_part + girls_winter_tot_part + girls_spring_tot_part,
    # Boys year-long total with corollary sports
    boys_total_participation_cs = boys_fall_tot_part_cs + boys_winter_tot_part_cs + boys_spring_tot_part_cs,
    # Girls year-long total with corollary sports
    girls_total_participation_cs = girls_fall_tot_part_cs + girls_winter_tot_part_cs + girls_spring_tot_part_cs
  )
# Select only the participation columns needed for comparing to CRDC data, then collapse the school rows into one district row shaped like the other district data frames (leaid, county, year and the four totals).
montgomery <- mcps_totals_2017_18 %>%
  select(school, boys_total_participation, girls_total_participation, boys_total_participation_cs, girls_total_participation_cs) %>%
  # Separate the school abbreviation and number
  separate(school, c("school_abbreviation", "school_number")) %>%
  # Keep only schools that appear in the high school key. The join keys are named explicitly; they are the two columns the tables share.
  inner_join(hs_key, by = c("school_abbreviation", "school_number")) %>%
  # The school-name columns from the key are not reordered or renamed here because the summarise below drops every column except the totals.
  # Add a column to identify the year to which this data applies
  mutate(year = "2017-18") %>%
  group_by(year) %>%
  summarise(
    boys_total_participation = sum(boys_total_participation),
    girls_total_participation = sum(girls_total_participation),
    boys_total_participation_cs = sum(boys_total_participation_cs),
    girls_total_participation_cs = sum(girls_total_participation_cs)
  ) %>%
  # Add the district identifiers as the first columns
  mutate(leaid = "2400480", county = "Montgomery", .before = 1)
After loading and transforming the data from each district, I bound all of it into one data frame for comparison with federal data.
# Stack the 20 district data frames into one and lower-case the county names
md_districts <- rbind(
  allegany,
  anne_arundel,
  baltimore_county,
  calvert,
  caroline,
  carroll,
  cecil,
  charles,
  dorchester,
  frederick,
  garrett,
  harford,
  howard,
  kent,
  montgomery,
  somerset,
  st_marys,
  washington,
  wicomico,
  worcester
) %>%
  mutate(county = tolower(county))
Most of the analysis looked at athletic participation totals by gender. For two examples from specific districts, I needed to look at details about male and female athletes disaggregated by sport.
# Calvert County Public Schools --------------------
# Sport-level rows for 2017-18, with missing numeric values set to zero and sport names lower-cased.
calvert_2017_18 <- read_csv("../data/processed/md/calvert_all_years.csv") %>%
  filter(year == "2017-18") %>%
  mutate(
    across(where(is.numeric), ~ replace(.x, is.na(.x), 0)),
    sport = tolower(sport)
  ) %>%
  # Filter to only non-corollary sports (rows with no `type` value)
  filter(is.na(type))
# Caroline County Public Schools --------------------
# Boys and girls participation for 2017-18, totalled by sport.
# NOTE(review): this reads boys_participation and girls_participation, while the Caroline district loader reads per-school boys_participants_* and girls_participants_* columns from the same file. Confirm the csv contains both sets of columns.
caroline_2017_18 <- read_csv("../data/processed/md/caroline_all_years.csv") %>%
  filter(year == "2017-18") %>%
  mutate(
    across(where(is.numeric), ~ replace(.x, is.na(.x), 0)),
    sport = tolower(sport)
  ) %>%
  # Get totals of boys and girls by sport
  group_by(sport) %>%
  summarise(
    boys = sum(boys_participation),
    girls = sum(girls_participation)
  )
I acquired high school enrollment data from the Maryland State Department of Education to be used in comparing district athletics participation rates to enrollment by gender.
# Load 2017-18 school district enrollment for grades 9-12
# County names are lower-cased, then only the first five columns of the file are kept.
# NOTE(review): select(1:5) is positional, so it depends on the column order of the csv. Later code uses male_9_12, female_9_12 and enrollment_9_12 from this data frame; confirm they fall within the first five columns.
enroll_2017_18 <- read_csv("../data/source/msde/msde_enrollment_2017_18.csv") %>%
mutate(county = tolower(county)) %>%
select(1:5)
# Check that the enrollment total column is accurate
# (Disabled check: recomputes the total as male_9_12 + female_9_12 and flags whether it equals enrollment_9_12.)
#enroll_check_totals <- read_csv("../data/source/msde/msde_enrollment_2017_18.csv") %>%
#mutate(tot_check = male_9_12 + female_9_12) %>%
#mutate(status = ifelse(tot_check == enrollment_9_12, TRUE, FALSE))
“In a number of cases, figures collected by the Education Department’s Office for Civil Rights are incomplete and differ substantially from statistics kept by school districts, an analysis by the Shirley Povich Center for Sports Journalism and Howard Center for Investigative Journalism at the University of Maryland shows.”
I compared the raw numbers of athletes of each gender in district data to the same numbers in federal data. In most cases, the number of athletes was higher in district data than federal data. And the discrepancy was usually larger for boys than girls.
District data also included totals of athletes playing corollary sports, so I checked if including those participants accounted for the discrepancies. It did not. In fact, since most district totals were higher than federal totals, adding corollary sports widened the discrepancies between district and federal numbers.
# Join district data to Maryland sums. We have district data for 20 districts. CRDC data includes 24 districts and three specialized schools, so in this join we lose seven rows from CRDC data.
# `leaid` is the only column the two data frames share, so it is named as the join key explicitly.
districts_compare <- md_districts %>%
  inner_join(md_crdc, by = "leaid") %>%
  # Flag whether district totals equal CRDC participation totals by gender without corollary sports
  mutate(
    boys_check = boys_sum == boys_total_participation,
    girls_check = girls_sum == girls_total_participation
  ) %>%
  # Flag whether district totals equal CRDC participation totals by gender with corollary sports
  mutate(
    boys_check_cs = boys_sum == boys_total_participation_cs,
    girls_check_cs = girls_sum == girls_total_participation_cs
  ) %>%
  # Calculate differences between district totals and CRDC participation totals by gender without corollary sports
  mutate(
    boys_diff = boys_total_participation - boys_sum,
    girls_diff = girls_total_participation - girls_sum
  ) %>%
  # Calculate differences between district totals and CRDC participation totals by gender with corollary sports
  mutate(
    boys_diff_cs = boys_total_participation_cs - boys_sum,
    girls_diff_cs = girls_total_participation_cs - girls_sum
  )
# The only instance where a district total matches the federal total is for male athletes in Howard County
check <- districts_compare %>%
  filter(boys_check | girls_check | boys_check_cs | girls_check_cs)
output_formatted_table(districts_compare$boys_diff, "Difference between boys athletic participation totals in district and federal data")
x |
---|
49 |
2969 |
7474 |
563 |
83 |
1073 |
889 |
1414 |
-70 |
217 |
106 |
33 |
0 |
174 |
1121 |
32 |
115 |
554 |
365 |
96 |
# Show the district-minus-federal difference in girls participation for each district
districts_compare$girls_diff %>%
  output_formatted_table("Difference between girls athletic participation totals in district and federal data")
x |
---|
20 |
1681 |
4526 |
-11 |
1 |
753 |
348 |
1175 |
-78 |
-761 |
16 |
-76 |
2 |
110 |
280 |
57 |
50 |
3 |
21 |
16 |
# Show the rows where at least one district total equals the federal total
check %>%
  output_formatted_table("The only instance where a district total matches the federal total is for male athletes in Howard County.")
leaid | county | year | boys_total_participation | girls_total_participation | boys_total_participation_cs | girls_total_participation_cs | lea_state | lea_name | boys_sum | girls_sum | enr_m | enr_f | enr_total | boys_check | girls_check | boys_check_cs | girls_check_cs | boys_diff | girls_diff | boys_diff_cs | girls_diff_cs |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2400420 | howard | 2017-18 | 5395 | 4066 | 5651 | 4159 | MD | Howard County Public Schools | 5395 | 4064 | 8729 | 8395 | 17124 | TRUE | FALSE | FALSE | FALSE | 0 | 2 | 256 | 95 |
“The Povich and Howard centers analyzed athletics participation at the 20 Maryland public school districts that provided usable data and found that, in all but four cases, federal data describes a more favorable situation for female athletes than what the districts’ own records show.”
Under Title IX regulations, equity in high school sports participation is measured by comparing the percentages of male and female athletes at a school with the percentages of male and female students overall. I calculated those percentages in both the district and federal datasets. To determine the gender gap, I subtracted the boys enrollment percentage from the boys sports percentage. (The absolute values are the same whether you use boys or girls data for the subtraction. That’s also the case if you reverse the order of the variables in the subtraction.) Both steps were accomplished with the gender_gap function I defined at the start of this notebook.
Then, for each district I subtracted the gender gap that federal data shows from the gender gap that district records show. Most districts had a larger gender gap according to district records than federal data shows. Four districts did not follow this pattern: Somerset, Charles, Allegany and Howard. See note about reliability of Somerset data in the data-loading section of this notebook.
## Calculate proportionality in each dataset
# Join district athletics participation data with enrollment data.
# Only the identifying columns and the totals without corollary sports are kept (the first five columns of md_districts), and a combined total is added before the join.
districts_sports_and_enrollment <- md_districts %>%
  select(
    leaid, county, year,
    boys_total_participation, girls_total_participation
  ) %>%
  mutate(
    total_participation = boys_total_participation + girls_total_participation
  ) %>%
  # Natural join on whatever columns the two data frames share
  left_join(enroll_2017_18)
# Get proportionality and gender gap for each district based on district data.
# Percentages are rounded to one decimal place, and the gap column is renamed so it can sit beside the federal one.
prop_districts <- districts_sports_and_enrollment %>%
  gender_gap(
    "male_9_12", "female_9_12", "enrollment_9_12",
    "boys_total_participation", "girls_total_participation", "total_participation",
    1
  ) %>%
  rename(gender_gap_district = gender_gap)
# Get proportionality and gender gap for each district based on CRDC data
prop_crdc <- md_crdc %>%
  # CRDC has no combined athlete count, so add boys and girls together first
  mutate(crdc_total_sports = boys_sum + girls_sum) %>%
  gender_gap(
    "enr_m", "enr_f", "enr_total",
    "boys_sum", "girls_sum", "crdc_total_sports",
    1
  ) %>%
  rename(gender_gap_federal = gender_gap)
## Compare gender gaps for each district in the two datasets
# diff is the district-data gap minus the federal-data gap
compare_gender_gap <- prop_districts %>%
  inner_join(prop_crdc, by = "leaid") %>%
  select(
    leaid, year, county, lea_name,
    gender_gap_district, gender_gap_federal
  ) %>%
  mutate(diff = gender_gap_district - gender_gap_federal)
## Filter for districts where the difference between the gender gaps is negative. These are districts where the federal data does NOT describe a more favorable situation for girls than what district records show.
outliers <- compare_gender_gap %>%
  filter(diff < 0)
compare_gender_gap %>%
  output_formatted_table("Most districts analyzed had a larger gender gap according to district records than federal data shows.")
leaid | year | county | lea_name | gender_gap_district | gender_gap_federal | diff |
---|---|---|---|---|---|---|
2400030 | 2017-18 | allegany | Allegany County Public Schools | 7.7 | 8.0 | -0.3 |
2400060 | 2017-18 | anne arundel | Anne Arundel County Public Schools | 5.9 | -1.0 | 6.9 |
2400120 | 2017-18 | baltimore | Baltimore County Public Schools | 4.8 | -22.8 | 27.6 |
2400150 | 2017-18 | calvert | Calvert County Public Schools | 5.8 | -5.5 | 11.3 |
2400180 | 2017-18 | caroline | Caroline County Public Schools | 7.7 | 3.8 | 3.9 |
2400210 | 2017-18 | carroll | Carroll County Public Schools | 7.6 | 7.5 | 0.1 |
2400240 | 2017-18 | cecil | Cecil County Public Schools | 3.6 | -10.3 | 13.9 |
2400270 | 2017-18 | charles | Charles County Public Schools | 4.4 | 7.8 | -3.4 |
2400300 | 2017-18 | dorchester | Dorchester County Public Schools | 5.7 | 4.5 | 1.2 |
2400330 | 2017-18 | frederick | Frederick County Public Schools | 3.6 | -6.2 | 9.8 |
2400360 | 2017-18 | garrett | Garrett County Public Schools | 8.5 | 3.9 | 4.6 |
2400390 | 2017-18 | harford | Harford County Public Schools | 4.5 | 3.8 | 0.7 |
2400420 | 2017-18 | howard | Howard County Public Schools | 5.9 | 6.0 | -0.1 |
2400450 | 2017-18 | kent | Kent County Public Schools | 3.0 | -6.2 | 9.2 |
2400480 | 2017-18 | montgomery | Montgomery County Public Schools | 4.5 | 2.7 | 1.8 |
2400570 | 2017-18 | somerset | Somerset County Public Schools | 9.6 | 15.5 | -5.9 |
2400600 | 2017-18 | st. mary’s | St. Mary’s County Public Schools | 3.6 | 2.9 | 0.7 |
2400660 | 2017-18 | washington | Washington County Public Schools | 6.0 | -4.0 | 10.0 |
2400690 | 2017-18 | wicomico | Wicomico County Public Schools | 6.7 | -2.0 | 8.7 |
2400720 | 2017-18 | worcester | Worcester County Public Schools | 6.2 | 3.8 | 2.4 |
# Show the districts whose gender gap is smaller in district data than in federal data
outliers %>%
  output_formatted_table("Four districts did not follow this pattern: Somerset, Charles, Allegany and Howard.")
leaid | year | county | lea_name | gender_gap_district | gender_gap_federal | diff |
---|---|---|---|---|---|---|
2400030 | 2017-18 | allegany | Allegany County Public Schools | 7.7 | 8.0 | -0.3 |
2400270 | 2017-18 | charles | Charles County Public Schools | 4.4 | 7.8 | -3.4 |
2400420 | 2017-18 | howard | Howard County Public Schools | 5.9 | 6.0 | -0.1 |
2400570 | 2017-18 | somerset | Somerset County Public Schools | 9.6 | 15.5 | -5.9 |
“In fact, at 40% of districts, federal data indicates that when compared to their proportion of enrollment, girls outnumber boys in sports. But schools’ own data tells a different story: All districts have fewer opportunities for female athletes.”
In my gender gap calculations, a negative number indicates that girls’ percentage of athletes is higher than their proportion of enrollment, meaning they are overrepresented in sports. When looking at federal data, that scenario occurs in 8 of 20 districts (40%). When looking at district data, it never occurs.
# Make a new dataframe from gender gap comparison
girls_outnumber <- compare_gender_gap %>%
  # Create new columns showing whether girls outnumber boys, proportionally, in the federal data and district data. A negative gender gap means girls' share of athletes exceeds their share of enrollment.
  mutate(
    g_on_district = gender_gap_district < 0,
    g_on_federal = gender_gap_federal < 0
  )
# Define a variable that is the percentage of districts where federal data suggests girls outnumber boys in sports
percent_overrep_fed <- sum(girls_outnumber$g_on_federal) / nrow(girls_outnumber) * 100
# Define a variable that is the percentage of districts where district data suggests girls outnumber boys in sports
percent_overrep_district <- sum(girls_outnumber$g_on_district) / nrow(girls_outnumber) * 100
# Caption spelling corrected ("overrepresented")
output_formatted_table(percent_overrep_fed, "Percent of districts where federal data suggests girls are overrepresented in sports")
x |
---|
40 |
# Display the share of districts where the districts' own data shows girls overrepresented in sports
output_formatted_table(percent_overrep_district, "Percent of districts where district data suggests girls are overrepresented in sports")
x |
---|
0 |
“In the large suburban district of Baltimore County, for example, federal data paints a picture of sports fields teeming with female athletes. The Office for Civil Rights’ public website says that, as of the 2017-18 school year, girls comprised 49% of enrollment in the district and 72% of athletes — making girls overrepresented in sports by 23 percentage points.
But the district’s in-house athletics data, which the Povich and Howard centers obtained through a public records request, says that girls actually comprised 44% of athletes, meaning they were underrepresented in sports by about five percentage points.”
In my gender gap calculations, a negative number indicates that girls’ percentage of athletes is higher than their proportion of enrollment, meaning they are overrepresented in sports. A positive number for the gender gap indicates that girls are underrepresented in sports.
# Join the district-records proportions to the prop_crdc proportions on the district ID,
# then keep only the Baltimore County row
baltimore_county_gap <- inner_join(prop_districts, prop_crdc, by = "leaid") %>%
  filter(county == "baltimore") %>%
  # Columns present in both tables carry the join suffixes: .x comes from
  # prop_districts (left side) and .y from prop_crdc (right side)
  select(
    leaid, year, county, lea_name,
    gender_gap_district, gender_gap_federal,
    girls_sports_percent_district = girls_sports_percent.x,
    girls_sports_percent_federal = girls_sports_percent.y,
    girls_enroll_percent_federal = girls_enroll_percent.y
  )
output_formatted_table(baltimore_county_gap, "Baltimore County gender gap, district vs. federal, 2017-18")
leaid | year | county | lea_name | gender_gap_district | gender_gap_federal | girls_sports_percent_district | girls_sports_percent_federal | girls_enroll_percent_federal |
---|---|---|---|---|---|---|---|---|
2400120 | 2017-18 | baltimore | Baltimore County Public Schools | 4.8 | -22.8 | 43.7 | 71.5 | 48.7 |
“In Calvert County, a district spokesperson said its 2017-18 federal data submission did not include football, golf, wrestling, baseball and tennis because those sports were coed. According to district data, that means 616 boys and 54 girls were not counted. Federal data suggests girls are overrepresented in sports by six percentage points, but district data shows they are underrepresented by the same amount.”
I contacted school districts about the discrepancies I found between in-house athletics data and federal data. Some did not respond. Several district officials initially said they did not know where the federal data came from, although it is derived from a survey filled out by school officials. At every district where officials responded, I asked about a range of factors that might have caused a discrepancy, and I asked for other possible explanations. All of the factors I considered were ruled out except the single-sex definition upon which the federal data collection is built.
For this Calvert County example, I used district data that was disaggregated by sport. I summed the number of athletes in coed sports to show how many the district said were excluded from federal data. The resulting incompatible gender gaps were calculated in the proportionality and gender gap code above.
# Sum across sports that were considered coed to find the total of boys and girls that were excluded from federal data
calvert_coed <- calvert_2017_18 %>%
  filter(sport %in% c("football", "baseball", "golf", "tennis", "wrestling")) %>%
  group_by(county, year) %>%
  summarise(
    boys_total_participation = sum(boys_participants),
    girls_total_participation = sum(girls_participants),
    # Return an ungrouped table; otherwise the result stays grouped by county
    # and summarise() prints a regrouping message
    .groups = "drop"
  )
# Show Calvert gender gap in federal and district data
calvert_gap <- compare_gender_gap %>%
  filter(county == "calvert")
output_formatted_table(calvert_coed, "Total athletes in Calvert County coed sports, 2017-18")
county | year | boys_total_participation | girls_total_participation |
---|---|---|---|
Calvert | 2017-18 | 616 | 54 |
# Display the Calvert County row of the district-vs-federal gender gap comparison
output_formatted_table(calvert_gap, "Calvert County gender gap, district vs. federal, 2017-18")
leaid | year | county | lea_name | gender_gap_district | gender_gap_federal | diff |
---|---|---|---|---|---|---|
2400150 | 2017-18 | calvert | Calvert County Public Schools | 5.8 | -5.5 | 11.3 |
“According to a 2020-21 data tipsheet for schools, when a girl participates on a predominantly male team, such as football or wrestling, the entire team should be excluded from the data. Since that can lead schools to exclude greater numbers of male athletes, it can result in the overall proportion of female athletes appearing higher than is the case. For instance, if Caroline County’s football players were omitted in the last data collection from 2017-18, it would mean that 135 boys — or 15% of the district’s athletes — were not counted in the federal data.”
For this example, I used Caroline County participation data that was disaggregated by sport. I found the total number of athletes in each sport and calculated the percentage of total athletes that each sport represented.
# Calculate total athletes in Caroline County in 2017-18 and define as variables
boys_total <- sum(caroline_2017_18$boys)
girls_total <- sum(caroline_2017_18$girls)
athletes_total <- boys_total + girls_total
# Calculate each sport's share of all athletes (rounded to a whole percent)
# and look specifically at football
caroline_sports_percent <- caroline_2017_18 %>%
  mutate(
    total_in_sport = boys + girls,
    # mutate() evaluates sequentially, so total_in_sport is already available here
    percent = round((total_in_sport / athletes_total) * 100)
  ) %>%
  filter(sport == "football")
output_formatted_table(caroline_sports_percent, "Caroline County football players comprised 15% of the district's athletes in 2017-18.")
sport | boys | girls | total_in_sport | percent |
---|---|---|---|---|
football | 135 | 0 | 135 | 15 |
“At 16 of 20 Maryland school districts, federal civil rights data describes a more equitable playing field for female athletes than what districts’ own records show.”
The data in this chart comes from the gender gap comparisons that were calculated above.
# Display the full district-vs-federal gender gap comparison that backs the chart
output_formatted_table(compare_gender_gap, "Different Data, Different Opportunities")
leaid | year | county | lea_name | gender_gap_district | gender_gap_federal | diff |
---|---|---|---|---|---|---|
2400030 | 2017-18 | allegany | Allegany County Public Schools | 7.7 | 8.0 | -0.3 |
2400060 | 2017-18 | anne arundel | Anne Arundel County Public Schools | 5.9 | -1.0 | 6.9 |
2400120 | 2017-18 | baltimore | Baltimore County Public Schools | 4.8 | -22.8 | 27.6 |
2400150 | 2017-18 | calvert | Calvert County Public Schools | 5.8 | -5.5 | 11.3 |
2400180 | 2017-18 | caroline | Caroline County Public Schools | 7.7 | 3.8 | 3.9 |
2400210 | 2017-18 | carroll | Carroll County Public Schools | 7.6 | 7.5 | 0.1 |
2400240 | 2017-18 | cecil | Cecil County Public Schools | 3.6 | -10.3 | 13.9 |
2400270 | 2017-18 | charles | Charles County Public Schools | 4.4 | 7.8 | -3.4 |
2400300 | 2017-18 | dorchester | Dorchester County Public Schools | 5.7 | 4.5 | 1.2 |
2400330 | 2017-18 | frederick | Frederick County Public Schools | 3.6 | -6.2 | 9.8 |
2400360 | 2017-18 | garrett | Garrett County Public Schools | 8.5 | 3.9 | 4.6 |
2400390 | 2017-18 | harford | Harford County Public Schools | 4.5 | 3.8 | 0.7 |
2400420 | 2017-18 | howard | Howard County Public Schools | 5.9 | 6.0 | -0.1 |
2400450 | 2017-18 | kent | Kent County Public Schools | 3.0 | -6.2 | 9.2 |
2400480 | 2017-18 | montgomery | Montgomery County Public Schools | 4.5 | 2.7 | 1.8 |
2400570 | 2017-18 | somerset | Somerset County Public Schools | 9.6 | 15.5 | -5.9 |
2400600 | 2017-18 | st. mary’s | St. Mary’s County Public Schools | 3.6 | 2.9 | 0.7 |
2400660 | 2017-18 | washington | Washington County Public Schools | 6.0 | -4.0 | 10.0 |
2400690 | 2017-18 | wicomico | Wicomico County Public Schools | 6.7 | -2.0 | 8.7 |
2400720 | 2017-18 | worcester | Worcester County Public Schools | 6.2 | 3.8 | 2.4 |
“50 years after the passage of Title IX, the playing field remains unequal for female athletes”
The data in this chart comes from the calculations of what proportion of enrollment and athletics each gender represents in district records.
# Keep only the columns the chart needs from the district-records proportions:
# each gender's share of enrollment and of athletes, plus the resulting gender gap
md_graphic <- select(
  prop_districts,
  county, year,
  boys_enroll_percent, girls_enroll_percent,
  boys_sports_percent, girls_sports_percent,
  gender_gap_district
)
output_formatted_table(md_graphic, "Gender Gaps in Maryland High School Sports")
county | year | boys_enroll_percent | girls_enroll_percent | boys_sports_percent | girls_sports_percent | gender_gap_district |
---|---|---|---|---|---|---|
allegany | 2017-18 | 52.9 | 47.1 | 60.6 | 39.4 | 7.7 |
anne arundel | 2017-18 | 50.4 | 49.6 | 56.3 | 43.7 | 5.9 |
baltimore | 2017-18 | 51.5 | 48.5 | 56.3 | 43.7 | 4.8 |
calvert | 2017-18 | 49.4 | 50.6 | 55.2 | 44.8 | 5.8 |
caroline | 2017-18 | 50.8 | 49.2 | 58.5 | 41.5 | 7.7 |
carroll | 2017-18 | 50.8 | 49.2 | 58.4 | 41.6 | 7.6 |
cecil | 2017-18 | 52.0 | 48.0 | 55.6 | 44.4 | 3.6 |
charles | 2017-18 | 52.3 | 47.7 | 56.7 | 43.3 | 4.4 |
dorchester | 2017-18 | 50.1 | 49.9 | 55.8 | 44.2 | 5.7 |
frederick | 2017-18 | 52.0 | 48.0 | 55.6 | 44.4 | 3.6 |
garrett | 2017-18 | 52.0 | 48.0 | 60.5 | 39.5 | 8.5 |
harford | 2017-18 | 51.4 | 48.6 | 55.9 | 44.1 | 4.5 |
howard | 2017-18 | 51.1 | 48.9 | 57.0 | 43.0 | 5.9 |
kent | 2017-18 | 53.3 | 46.7 | 56.3 | 43.7 | 3.0 |
montgomery | 2017-18 | 51.5 | 48.5 | 56.0 | 44.0 | 4.5 |
somerset | 2017-18 | 48.5 | 51.5 | 58.1 | 41.9 | 9.6 |
st. mary’s | 2017-18 | 50.7 | 49.3 | 54.3 | 45.7 | 3.6 |
washington | 2017-18 | 50.8 | 49.2 | 56.8 | 43.2 | 6.0 |
wicomico | 2017-18 | 51.6 | 48.4 | 58.3 | 41.7 | 6.7 |
worcester | 2017-18 | 52.1 | 47.9 | 58.3 | 41.7 | 6.2 |
-30-