Introduction

This data notebook contains the analysis that generated facts in the story “Federal Title IX data on sports participation is unreliable” from the series “Unlevel Playing Fields”. For each sentence in the story generated by original data analysis, I have provided the corresponding code and results.

Load Libraries, Settings and Global Functions

## Libraries

# For general data science
library(tidyverse)

# For data cleaning
library(janitor)

# For loading Excel files
library(readxl)

# For working with datetime
library(lubridate)

# For U.S. Census Bureau data
library(tigris)

# Avoid use of scientific notation
options(scipen = 999)

# For pretty tables
library(kableExtra)
library(knitr)

## Functions 

# Shared helper for districts() and districts_all_years(): collapse one
# district's sport-level participation rows into district totals.
#   district_athletics: data frame with county, year, type, boys_participants
#     and girls_participants columns
#   lea: district identifier stored in the leaid column of the result
# Returns one row per county/year holding totals for the rows whose `type` is
# NA (boys_total_participation, girls_total_participation) and totals across
# every row (the *_cs columns), with leaid as the first column.
summarise_district_participation <- function(district_athletics, lea) {
  # A missing count in any numeric column is treated as zero participants
  district_athletics <- district_athletics %>%
    mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0)))

  # Totals for rows with no `type` value only
  district_sums_without_cau <- district_athletics %>%
    filter(is.na(type)) %>%
    group_by(county, year) %>%
    summarise(
      boys_total_participation = sum(boys_participants),
      girls_total_participation = sum(girls_participants),
      # Spell out summarise()'s default so the result stays grouped by county
      # exactly as before, without the grouping message
      .groups = "drop_last"
    )

  # Totals across every row, whatever its `type`
  district_sums_all <- district_athletics %>%
    group_by(county, year) %>%
    summarise(
      boys_total_participation_cs = sum(boys_participants),
      girls_total_participation_cs = sum(girls_participants),
      .groups = "drop_last"
    )

  # Explicit keys: county and year are the only columns the two tables share
  district_sums_without_cau %>%
    left_join(district_sums_all, by = c("county", "year")) %>%
    mutate(leaid = lea, .before = 1)
}

# Function for loading and transforming districts with 2017_18 data only
#   county: file-name prefix of the district's csv in ../data/processed/md/
#   lea: district identifier to attach as leaid
districts <- function(county, lea) {
  path <- paste0("../data/processed/md/", county, "_2017_18.csv")

  summarise_district_participation(read_csv(path), lea)
}

# Function for loading and transforming districts with all years
# Same as districts(), but reads the "_all_years" file and keeps only the
# 2017-18 rows before totalling.
districts_all_years <- function(county, lea) {
  path <- paste0("../data/processed/md/", county, "_all_years.csv")

  district_athletics <- read_csv(path) %>%
    filter(year == "2017-18")

  summarise_district_participation(district_athletics, lea)
}

# Function to calculate the proportion of athletes and enrollment that boys and girls represent in multiple datasets. Also calculates the gender gap using those proportions.
#   df: data frame holding the enrollment and participation columns
#   male_enrollment, female_enrollment, total_enrollment,
#   boys_sports, girls_sports, total_sports: column names, each a single string
#   rounding_digits: decimal places kept in every percentage
# Returns `df` with boys_enroll_percent, girls_enroll_percent,
# boys_sports_percent, girls_sports_percent and gender_gap added.
# The columns are named by strings, so they are read with `[[` directly. The
# result is returned rather than written to a global variable.
gender_gap <- function(df, male_enrollment, female_enrollment, total_enrollment, boys_sports, girls_sports, total_sports, rounding_digits) {
  # Fail early with a readable message if a named column is absent
  needed_cols <- c(
    male_enrollment, female_enrollment, total_enrollment,
    boys_sports, girls_sports, total_sports
  )
  missing_cols <- setdiff(needed_cols, names(df))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `df`: ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Percentage that column `part` represents of column `whole`, rounded
  percent_of <- function(part, whole) {
    round(df[[part]] / df[[whole]] * 100, rounding_digits)
  }

  # Calculate percentage of student body that is male and percentage that is female
  df[["boys_enroll_percent"]] <- percent_of(male_enrollment, total_enrollment)
  df[["girls_enroll_percent"]] <- percent_of(female_enrollment, total_enrollment)

  # Calculate percentage of athletes that are male and percentage that are female
  df[["boys_sports_percent"]] <- percent_of(boys_sports, total_sports)
  df[["girls_sports_percent"]] <- percent_of(girls_sports, total_sports)

  # Subtract enrollment percentage from athlete percentage to find the gender gap
  # (computed from the already-rounded percentages)
  df[["gender_gap"]] <- df[["boys_sports_percent"]] - df[["boys_enroll_percent"]]

  df
}


# Function for formatted table output
#   table: object to render (a data frame or vector)
#   text: caption shown with the table
# Renders `table` with kable(), applies kableExtra bootstrap styling with a
# fixed header, and wraps the result in a full-width scroll box.
output_formatted_table <- function(table, text) {
  table %>%
    kable(caption = text) %>%
    kable_styling(
      bootstrap_options = c("striped", "hover", "condensed", "responsive"),
      font_size = 14,
      # TRUE rather than T: T is an ordinary variable that can be reassigned
      fixed_thead = TRUE
    ) %>%
    scroll_box(width = "100%")
}

Load and Clean Data

Federal Data

Since 2000, the U.S. Department of Education’s Office for Civil Rights has required K-12 schools to submit data on various programs, including athletics, through the Civil Rights Data Collection. The survey typically occurs every two years; however, the cycle was disrupted by the COVID-19 pandemic. At the time of reporting, the latest available CRDC data was from the 2017-18 school year.

The specific CRDC elements I used were the single-sex athletics questions and the enrollment variables.

## CRDC athletic participation data
# Load the CRDC athletics file and turn its -9 placeholder into NA
# (presumably the CRDC reserve code for "not applicable" - confirm against the
# CRDC documentation). The sentinel is given in the same type as the column it
# is compared with: text for the Yes/No indicator, a number for numeric
# columns. Recent dplyr versions reject a mismatched na_if() sentinel.
crdc_2017_18 <- read.csv("../data/source/civil_rights_data_collection_OCR/2017-18-crdc-data-corrected-publication-2/data/crdc_athletics_2017_18.csv") %>%
  mutate(SCH_SSATHLETICS_IND = na_if(SCH_SSATHLETICS_IND, "-9")) %>%
  mutate(across(where(is.numeric), ~ na_if(.x, -9))) %>%
  clean_names() %>%
  # Correct the misspelled district name as it appears in the source file
  mutate(lea_name = case_when(
    lea_name == "Washingtion County Public Schools" ~ "Washington County Public Schools",
    TRUE ~ lea_name
  ))

## CRDC enrollment data
# Keep only the school key and the male/female enrollment totals for Maryland schools
md_crdc_enroll <- read_csv(
  "../data/source/civil_rights_data_collection_OCR/2017-18-crdc-data-corrected-publication-2/data/crdc_enrollment_2017_18.csv"
) %>%
  clean_names() %>%
  filter(lea_state == "MD") %>%
  select(
    combokey,
    tot_enr_m,
    tot_enr_f
  )

## Join CRDC athletic participation and enrollment data
# Keep Maryland schools that report single-sex athletics. This removes elementary and middle schools, juvenile justice facilities and some specialized schools.
md_schools_crdc <- crdc_2017_18 %>%
  filter(
    sch_ssathletics_ind == "Yes",
    lea_state == "MD"
  ) %>%
  # The combokeys for Maryland schools are not correct. In other CRDC datasets, the combokeys are generated by pasting together the leaid and the 5-digit school id. Rebuild the key that way so these unique identifiers can be used to join with enrollment data for proportionality calculations.
  mutate(
    combokey_correct = paste0(
      leaid,
      str_pad(schid, width = 5, side = "left", pad = "0")
    )
  ) %>%
  # A left join keeps every athletics school and adds enrollment where the key matches. Schools that appear only in the enrollment data are not brought in.
  left_join(md_crdc_enroll, by = c("combokey_correct" = "combokey")) %>%
  # Total enrollment is the sum of male and female enrollment
  mutate(tot_enr = tot_enr_m + tot_enr_f)

## Calculate districtwide sums from Maryland schools in CRDC
# One row per district (leaid): boys and girls participation and male, female and total enrollment, each summed across the district's schools
md_crdc <- md_schools_crdc %>%
  group_by(leaid, lea_state, lea_name) %>%
  summarise(
    boys_sum = sum(sch_sspart_m),
    girls_sum = sum(sch_sspart_f),
    enr_m = sum(tot_enr_m),
    enr_f = sum(tot_enr_f),
    enr_total = sum(tot_enr)
  )

District Data

To check if federal data reflected reality, my colleagues and I filed public records requests for athletics participation by gender at each of Maryland’s 24 public school districts. We received usable data from 20 districts. For most districts, we received five years of data, but I focused my analysis on 2017-18 because that was the latest available for federal data.

Baltimore City Public Schools provided data that came from the Office for Civil Rights, so it could not be compared with the type of data we received from other districts. Prince George’s County Public Schools, Queen Anne’s County Public Schools and Talbot County Public Schools did not fulfill our public records requests.

In examining Somerset County Public Schools’ data, I found that the district had recorded 2017-18 cross country boys participation as 116 when it should have been 16. I corrected that within the district data, but the athletic director could not confirm if that error was carried over to the district’s federal data reporting. So the CRDC data for Somerset has been left unchanged.

Standard Format

Most districts provided the athletics data from annual forms they submit to the state athletics association. My colleagues and I extracted the data from those PDFs into CSVs. Then I wrote the districts functions (above) to transform the data to a structure that could be compared with federal data.

## Run functions to load and transform athletics data from each district
# First argument: file-name prefix of the district's csv. Second: the district's leaid.
# Kent is the only district loaded from a 2017_18-only file.
allegany         <- districts_all_years("allegany", "2400030")
anne_arundel     <- districts_all_years("anne_arundel", "2400060")
baltimore_county <- districts_all_years("baltimore_county", "2400120")
calvert          <- districts_all_years("calvert", "2400150")
carroll          <- districts_all_years("carroll", "2400210")
cecil            <- districts_all_years("cecil", "2400240")
charles          <- districts_all_years("charles", "2400270")
dorchester       <- districts_all_years("dorchester", "2400300")
garrett          <- districts_all_years("garrett", "2400360")
harford          <- districts_all_years("harford", "2400390")
howard           <- districts_all_years("howard", "2400420")
kent             <- districts("kent", "2400450")
somerset         <- districts_all_years("somerset", "2400570")
st_marys         <- districts_all_years("st_marys", "2400600")
washington       <- districts_all_years("washington", "2400660")
wicomico         <- districts_all_years("wicomico", "2400690")
worcester        <- districts_all_years("worcester", "2400720")

Atypical Formats

A few districts provided more detailed in-house data which requires customized transformation scripts.

## Caroline County --------------------
# Caroline's file has one participation column per school and gender, so the two schools are summed separately and then added together
caroline <- read_csv("../data/processed/md/caroline_all_years.csv") %>%
  # Missing counts in numeric columns count as zero
  mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0))) %>%
  group_by(county, year) %>%
  summarise(
    boys_colrich = sum(boys_participants_colonel_richardson),
    girls_colrich = sum(girls_participants_colonel_richardson),
    boys_northcar = sum(boys_participants_north_caroline),
    girls_northcar = sum(girls_participants_north_caroline)
  ) %>%
  # Add totals from each school. Set corollary sports equal to 0 because we don't have that data from this district
  # NOTE(review): for the other districts the *_cs columns are totals over all rows, so 0 here is a placeholder rather than a comparable total
  mutate(
    boys_total_participation = boys_colrich + boys_northcar,
    girls_total_participation = girls_colrich + girls_northcar,
    leaid = "2400180",
    boys_total_participation_cs = 0,
    girls_total_participation_cs = 0
  ) %>%
  select(
    leaid, county, year,
    boys_total_participation, girls_total_participation,
    boys_total_participation_cs, girls_total_participation_cs
  ) %>%
  filter(year == "2017-18")

## Frederick County --------------------
# Frederick's file has one row per school with male_athletes and female_athletes columns
frederick <- read_csv("../data/processed/md/frederick_2017_18.csv") %>%
  # Name the first column
  rename(school = 1) %>%
  # Use the first ten rows only (presumably the school rows, with later rows not being schools - confirm against the file)
  slice(1:10) %>%
  mutate(
    leaid = "2400330",
    county = "Frederick",
    year = "2017-18",
    .before = 1
  ) %>%
  group_by(leaid, county, year) %>%
  summarise(
    boys_total_participation = sum(male_athletes),
    girls_total_participation = sum(female_athletes)
  ) %>%
  # Set corollary sports equal to 0 because we don't have that data from this district
  mutate(
    boys_total_participation_cs = 0,
    girls_total_participation_cs = 0
  )

## Montgomery County --------------------
# Read in the key that the school provided to match school names, numbers and abbreviations
hs_key <- read_excel(
  "../data/source/maryland_school_districts/montgomery_county_ps/fy22_mcps_high_school_name_key.xlsx",
  col_names = c(
    "alpha", "school_number", "school_abbreviation",
    "school_short", "school_long", "school"
  ),
  skip = 1
) %>%
  mutate(school_number = as.character(school_number)) %>%
  # Keep school_number, school_abbreviation and school_long
  select(2, 3, 5) %>%
  mutate(
    # Add "High" to the end of the school names to match CRDC data
    school_crdc = paste(school_long, "High", sep = " "),
    # Replace Bethesda Chevy Chase abbreviation with the abbreviation that is in the school's participation data
    school_abbreviation = case_when(
      school_abbreviation == "BC" ~ "BCC",
      TRUE ~ school_abbreviation
    ),
    # Add "School" to "Northwood High" to match the school's name in the Civil Rights Data Collection data with which this will later be joined
    school_crdc = case_when(
      school_crdc == "Northwood High" ~ "Northwood High School",
      TRUE ~ school_crdc
    )
  )

# I found that the district's year-long totals had errors in some calculations, making it necessary to read in each season's data and calculate totals
# Load fall 2017 data (sheet 1), naming every column by position
mcps_athletics_fall_2017_18 <- read_excel(
  "../data/source/maryland_school_districts/montgomery_county_ps/mcps_athletics_participation_2017_18.xlsx",
  sheet = 1,
  col_names = c(
    "school",
    "cc_v_boys", "cc_v_girls", "cc_v_tot",
    "fh_v_girls", "fh_jv_girls", "fh_tot_girls",
    "fb_v_boys", "fb_jv_boys", "fb_tot_boys",
    "golf_v_boys", "golf_v_girls", "golf_v_tot",
    "soc_v_boys", "soc_jv_boys", "soc_tot_boys",
    "soc_v_girls", "soc_jv_girls", "soc_tot_girls",
    "tennis_v_girls",
    "vb_v_girls", "vb_jv_girls", "vb_tot_girls",
    "hand_v_boys", "hand_v_girls", "hand_v_tot",
    "varsity_total_boys", "varsity_total_girls", "varsity_total",
    "jv_total_boys", "jv_total_girls", "jv_total",
    "total_participation_boys", "total_participation_girls", "total_participation",
    "cheer_v_boys", "cheer_jv_boys", "cheer_v_girls", "cheer_jv_girls", "cheer_total",
    "poms_v_girls",
    "cheer_poms_tot_boys", "cheer_poms_tot_girls", "cheer_pom_total",
    "grand_total_all"
  ),
  skip = 3
) %>%
  # Drop the summary rows so only school rows remain
  filter(
    school != "TOTAL",
    school != "MOST",
    school != "LEAST",
    school != "AVG/25"
  ) %>%
  mutate(cc_v_boys = as.numeric(cc_v_boys))

# Calculate fall 2017 total participation by gender
mcps_fall_totals <- mcps_athletics_fall_2017_18 %>%
  mutate(
    # Boys total without handball, which is a corollary sport. Cheer and poms columns are not added.
    boys_fall_tot_part = cc_v_boys + fb_v_boys + fb_jv_boys + golf_v_boys + soc_v_boys + soc_jv_boys,
    # Girls total without handball, which is a corollary sport. Cheer and poms columns are not added.
    girls_fall_tot_part = cc_v_girls + fh_v_girls + fh_jv_girls + golf_v_girls + soc_v_girls + soc_jv_girls + tennis_v_girls + vb_v_girls + vb_jv_girls,
    # Boys total with corollary sports (handball)
    boys_fall_tot_part_cs = boys_fall_tot_part + hand_v_boys,
    # Girls total with corollary sports (handball)
    girls_fall_tot_part_cs = girls_fall_tot_part + hand_v_girls
  )

# Load winter 2017-18 participation data (sheet 2), naming every column by position
mcps_athletics_winter_2017_18 <- read_excel(
  "../data/source/maryland_school_districts/montgomery_county_ps/mcps_athletics_participation_2017_18.xlsx",
  sheet = 2,
  col_names = c(
    "school",
    "bball_v_boys", "bball_jv_boys", "bball_tot_boys",
    "bball_v_girls", "bball_jv_girls", "bball_tot_girls",
    "intrk_v_boys", "intrk_v_girls", "intrk_v_tot",
    "swim_v_boys", "swim_v_girls", "swim_v_tot",
    "wrestlg_v_boys", "wrestlg_jv_boys", "wrestlg_tot_girls", "wrestlg_tot",
    "bocce_v_boys", "bocce_v_girls", "bocce_tot",
    "varsity_total_boys", "varsity_total_girls", "varsity_total",
    "jv_total_boys", "jv_total_girls", "jv_total",
    "total_participation_boys", "total_participation_girls", "total_participation",
    "cheer_v_boys", "cheer_v_girls", "cheer_jv_girls", "cheer_total",
    "poms_v_girls",
    "cheer_poms_tot_boys", "cheer_poms_tot_girls", "cheer_pom_total",
    "grand_total_all"
  ),
  skip = 3
) %>%
  # Drop the summary rows and any row labelled "NA" so only school rows remain
  filter(
    school != "TOTAL",
    school != "MOST",
    school != "LEAST",
    school != "AVG/25",
    school != "NA"
  ) %>%
  mutate(bball_jv_boys = as.numeric(bball_jv_boys))

# Calculate winter total participation by gender
mcps_winter_totals <- mcps_athletics_winter_2017_18 %>%
  mutate(
    # Boys total without bocce, which is a corollary sport. Cheer and poms columns are not added.
    boys_winter_tot_part = bball_v_boys + bball_jv_boys + intrk_v_boys + swim_v_boys + wrestlg_v_boys + wrestlg_jv_boys,
    # Girls total without bocce, which is a corollary sport. Cheer and poms columns are not added.
    girls_winter_tot_part = bball_v_girls + bball_jv_girls + intrk_v_girls + swim_v_girls + wrestlg_tot_girls,
    # Boys total with corollary sports (bocce)
    boys_winter_tot_part_cs = boys_winter_tot_part + bocce_v_boys,
    # Girls total with corollary sports (bocce)
    girls_winter_tot_part_cs = girls_winter_tot_part + bocce_v_girls
  )

# Load spring 2018 participation data (sheet 3), naming every column by position
mcps_athletics_spring_2017_18 <- read_excel(
  "../data/source/maryland_school_districts/montgomery_county_ps/mcps_athletics_participation_2017_18.xlsx",
  sheet = 3,
  col_names = c(
    "school",
    "base_v_boys", "base_jv_boys", "base_tot_boys",
    "gym_v_girls",
    "lax_v_boys", "lax_jv_boys", "lax_tot_boys",
    "lax_v_girls", "lax_jv_girls", "lax_tot_girls",
    "soft_v_girls", "soft_jv_girls", "soft_tot_girls",
    "tennis_v_boys",
    "trk_v_boys", "trk_v_girls", "trk_tot",
    "vball_v_boys", "vball_co_v_boys", "vball_co_girls", "vball_co_tot",
    "allied_soft_v_boys", "allied_soft_v_girls", "allied_soft_tot",
    "varsity_total_boys", "varsity_total_girls", "varsity_total",
    "jv_total_boys", "jv_total_girls", "jv_total",
    "total_participation_boys", "total_participation_girls", "total_participation"
  ),
  skip = 3
) %>%
  # Drop the summary rows and any row labelled "NA" so only school rows remain
  filter(
    school != "TOTAL",
    school != "MOST",
    school != "LEAST",
    school != "AVG/25",
    school != "NA"
  ) %>%
  mutate(base_v_boys = as.numeric(base_v_boys))

# Calculate spring total participation by gender
mcps_spring_totals <- mcps_athletics_spring_2017_18 %>%
  mutate(
    # Boys total without allied softball, which is a corollary sport
    boys_spring_tot_part = base_v_boys + base_jv_boys + lax_v_boys + lax_jv_boys + tennis_v_boys + trk_v_boys + vball_v_boys + vball_co_v_boys,
    # Girls total without allied softball, which is a corollary sport
    girls_spring_tot_part = gym_v_girls + lax_v_girls + lax_jv_girls + soft_v_girls + soft_jv_girls + trk_v_girls + vball_co_girls,
    # Boys total with corollary sports (allied softball)
    boys_spring_tot_part_cs = boys_spring_tot_part + allied_soft_v_boys,
    # Girls total with corollary sports (allied softball)
    girls_spring_tot_part_cs = girls_spring_tot_part + allied_soft_v_girls
  )

# Reduce each season to the school plus its four computed totals, which is all the year-long calculation needs
# Fall
mcps_fall_totals_clean <- mcps_fall_totals %>%
  select(
    school,
    boys_fall_tot_part, girls_fall_tot_part,
    boys_fall_tot_part_cs, girls_fall_tot_part_cs
  )
# Winter
mcps_winter_totals_clean <- mcps_winter_totals %>%
  select(
    school,
    boys_winter_tot_part, girls_winter_tot_part,
    boys_winter_tot_part_cs, girls_winter_tot_part_cs
  )
# Spring
mcps_spring_totals_clean <- mcps_spring_totals %>%
  select(
    school,
    boys_spring_tot_part, girls_spring_tot_part,
    boys_spring_tot_part_cs, girls_spring_tot_part_cs
  )

# Join the fall, winter, spring dataframes; only schools present in all three seasons are kept
mcps_totals_2017_18 <- mcps_fall_totals_clean %>%
  inner_join(mcps_winter_totals_clean) %>%
  inner_join(mcps_spring_totals_clean) %>%
  mutate(
    # Year-long boys total without corollary sports
    boys_total_participation = boys_fall_tot_part + boys_winter_tot_part + boys_spring_tot_part,
    # Year-long girls total without corollary sports
    girls_total_participation = girls_fall_tot_part + girls_winter_tot_part + girls_spring_tot_part,
    # Year-long boys total with corollary sports
    boys_total_participation_cs = boys_fall_tot_part_cs + boys_winter_tot_part_cs + boys_spring_tot_part_cs,
    # Year-long girls total with corollary sports
    girls_total_participation_cs = girls_fall_tot_part_cs + girls_winter_tot_part_cs + girls_spring_tot_part_cs
  )

# Reduce the school-level totals to a single district row for comparing to CRDC data
montgomery <- mcps_totals_2017_18 %>%
  select(
    school,
    boys_total_participation, girls_total_participation,
    boys_total_participation_cs, girls_total_participation_cs
  ) %>%
  # Separate the school abbreviation and number
  separate(school, into = c("school_abbreviation", "school_number")) %>%
  # Join with high school key for names; schools with no match in the key are dropped
  inner_join(hs_key) %>%
  # Put the CRDC-style name first and drop the other key column
  select(8, 1:6) %>%
  # Rename school name column to match CRDC format
  rename(sch_name = school_crdc) %>%
  # Add a column to identify the year to which this data applies
  mutate(year = "2017-18", .before = sch_name) %>%
  # Sum every school into one row for the year
  group_by(year) %>%
  summarise(
    boys_total_participation = sum(boys_total_participation),
    girls_total_participation = sum(girls_total_participation),
    boys_total_participation_cs = sum(boys_total_participation_cs),
    girls_total_participation_cs = sum(girls_total_participation_cs)
  ) %>%
  mutate(
    leaid = "2400480",
    county = "Montgomery",
    .before = 1
  )

Create dataframe with all districts’ athletic participation totals

After loading and transforming the data from each district, I bound all of it into one data frame for comparison with federal data.

# Stack the 20 district tables and lower-case the county names
md_districts <- rbind(
  allegany, anne_arundel, baltimore_county, calvert, caroline,
  carroll, cecil, charles, dorchester, frederick,
  garrett, harford, howard, kent, montgomery,
  somerset, st_marys, washington, wicomico, worcester
) %>%
  mutate(county = tolower(county))

District Data By Sport

Most of the analysis looked at athletic participation totals by gender. For two examples from specific districts, I needed to look at details about male and female athletes disaggregated by sport.

# Calvert County Public Schools --------------------
# Sport-level rows for 2017-18
calvert_2017_18 <- read_csv("../data/processed/md/calvert_all_years.csv") %>%
  filter(year == "2017-18") %>%
  # Missing counts in numeric columns count as zero
  mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0))) %>%
  mutate(sport = tolower(sport)) %>%
  # Filter to only non-corollary sports (rows with no type)
  filter(is.na(type))

# Caroline County Public Schools --------------------
# NOTE(review): this block reads boys_participation / girls_participation, while the district-totals block reads boys_participants_<school> columns from the same file - confirm both sets of columns exist in the csv
caroline_2017_18 <- read_csv("../data/processed/md/caroline_all_years.csv") %>%
  # Missing counts in numeric columns count as zero
  mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), 0))) %>%
  mutate(sport = tolower(sport)) %>%
  filter(year == "2017-18") %>%
  # Get totals of boys and girls by sport
  group_by(sport) %>%
  summarise(
    boys = sum(boys_participation),
    girls = sum(girls_participation)
  )

State Data

I acquired high school enrollment data from the Maryland State Department of Education to be used in comparing district athletics participation rates to enrollment by gender.

# Load 2017-18 school district enrollment for grades 9-12
# County names are lower-cased and only the first five columns are kept
enroll_2017_18 <- read_csv(
  "../data/source/msde/msde_enrollment_2017_18.csv"
) %>%
  mutate(county = tolower(county)) %>%
  select(1:5)

# Check that the enrollment total column is accurate
#enroll_check_totals <- read_csv("../data/source/msde/msde_enrollment_2017_18.csv") %>% 
  #mutate(tot_check = male_9_12 + female_9_12) %>% 
  #mutate(status = ifelse(tot_check == enrollment_9_12, TRUE, FALSE))

Line-By-Line Fact-Check

FACT: Figures collected by the Education Department’s Office for Civil Rights are incomplete and differ substantially from statistics kept by school districts

“In a number of cases, figures collected by the Education Department’s Office for Civil Rights are incomplete and differ substantially from statistics kept by school districts, an analysis by the Shirley Povich Center for Sports Journalism and Howard Center for Investigative Journalism at the University of Maryland shows.”

Explanation

I compared the raw numbers of athletes of each gender in district data to the same numbers in federal data. In most cases, the number of athletes was higher in district data than federal data. And the discrepancy was usually larger for boys than girls.

District data also included totals of athletes playing corollary sports, so I checked if including those participants accounted for the discrepancies. It did not. In fact, since most district totals were higher than federal totals, adding corollary sports widened the discrepancies between district and federal numbers.

Supporting code and output

# Join district data to Maryland sums. We have district data for 20 districts. CRDC data includes 24 districts and three specialized schools, so in this join we lose seven rows from CRDC data.
districts_compare <- md_districts %>%
  inner_join(md_crdc) %>%
  mutate(
    # Does the district total equal the CRDC total, by gender, without corollary sports?
    boys_check = boys_sum == boys_total_participation,
    girls_check = girls_sum == girls_total_participation,
    # Does the district total equal the CRDC total, by gender, with corollary sports?
    boys_check_cs = boys_sum == boys_total_participation_cs,
    girls_check_cs = girls_sum == girls_total_participation_cs,
    # District total minus CRDC total, by gender, without corollary sports
    boys_diff = boys_total_participation - boys_sum,
    girls_diff = girls_total_participation - girls_sum,
    # District total minus CRDC total, by gender, with corollary sports
    boys_diff_cs = boys_total_participation_cs - boys_sum,
    girls_diff_cs = girls_total_participation_cs - girls_sum
  )

# Keep districts where any of the four totals matches the federal total.
# The only instance where a district total matches the federal total is for male athletes in Howard County
check <- districts_compare %>%
  filter(boys_check | girls_check | boys_check_cs | girls_check_cs)


output_formatted_table(
  table = districts_compare$boys_diff,
  text = "Difference between boys athletic participation totals in district and federal data"
)
Difference between boys athletic participation totals in district and federal data
x
49
2969
7474
563
83
1073
889
1414
-70
217
106
33
0
174
1121
32
115
554
365
96
output_formatted_table(
  table = districts_compare$girls_diff,
  text = "Difference between girls athletic participation totals in district and federal data"
)
Difference between girls athletic participation totals in district and federal data
x
20
1681
4526
-11
1
753
348
1175
-78
-761
16
-76
2
110
280
57
50
3
21
16
output_formatted_table(
  table = check,
  text = "The only instance where a district total matches the federal total is for male athletes in Howard County."
)
The only instance where a district total matches the federal total is for male athletes in Howard County.
leaid county year boys_total_participation girls_total_participation boys_total_participation_cs girls_total_participation_cs lea_state lea_name boys_sum girls_sum enr_m enr_f enr_total boys_check girls_check boys_check_cs girls_check_cs boys_diff girls_diff boys_diff_cs girls_diff_cs
2400420 howard 2017-18 5395 4066 5651 4159 MD Howard County Public Schools 5395 4064 8729 8395 17124 TRUE FALSE FALSE FALSE 0 2 256 95

FACT: At 16 of 20 Maryland districts, federal data describes a more favorable situation for female athletes than district records show

“The Povich and Howard centers analyzed athletics participation at the 20 Maryland public school districts that provided usable data and found that, in all but four cases, federal data describes a more favorable situation for female athletes than what the districts’ own records show.”

Explanation

Under Title IX regulations, equity in high school sports participation is measured by comparing the percentages of male and female athletes at a school with the percentages of male and female students overall. I calculated those percentages in both the district and federal datasets. To determine the gender gap, I subtracted the boys enrollment percentage from the boys sports percentage. (The absolute values are the same whether you use boys or girls data for the subtraction. That’s also the case if you reverse the order of the variables in the subtraction.) Both steps were accomplished with the gender_gap function I defined at the start of this notebook.

Then, for each district I subtracted the gender gap that federal data shows from the gender gap that district records show. Most districts had a larger gender gap according to district records than federal data shows. Four districts did not follow this pattern: Somerset, Charles, Allegany and Howard. See note about reliability of Somerset data in the data-loading section of this notebook.

Supporting code and output

## Calculate proportionality in each dataset
# District side: keep the first five columns of the district totals, add an overall participation total, then attach state enrollment
districts_sports_and_enrollment <- md_districts %>%
  select(1:5) %>%
  mutate(
    total_participation = boys_total_participation + girls_total_participation
  ) %>%
  left_join(enroll_2017_18)

# Get proportionality and gender gap for each district based on district data
prop_districts <- gender_gap(
  districts_sports_and_enrollment,
  male_enrollment = "male_9_12",
  female_enrollment = "female_9_12",
  total_enrollment = "enrollment_9_12",
  boys_sports = "boys_total_participation",
  girls_sports = "girls_total_participation",
  total_sports = "total_participation",
  rounding_digits = 1
) %>%
  rename(gender_gap_district = gender_gap)

# Get proportionality and gender gap for each district based on CRDC data
prop_crdc <- md_crdc %>%
  mutate(crdc_total_sports = boys_sum + girls_sum) %>%
  gender_gap(
    male_enrollment = "enr_m",
    female_enrollment = "enr_f",
    total_enrollment = "enr_total",
    boys_sports = "boys_sum",
    girls_sports = "girls_sum",
    total_sports = "crdc_total_sports",
    rounding_digits = 1
  ) %>%
  rename(gender_gap_federal = gender_gap)

## Compare gender gaps for each district in the two datasets
# diff is the district-data gap minus the federal-data gap
compare_gender_gap <- prop_districts %>%
  inner_join(prop_crdc, by = c("leaid")) %>%
  select(
    leaid, year, county, lea_name,
    gender_gap_district, gender_gap_federal
  ) %>%
  mutate(diff = gender_gap_district - gender_gap_federal)


## Filter for districts where the difference between the gender gaps is negative. These are districts where the federal data does NOT describe a more favorable situation for girls than what district records show.
outliers <- compare_gender_gap %>%
  filter(diff < 0)

output_formatted_table(
  table = compare_gender_gap,
  text = "Most districts analyzed had a larger gender gap according to district records than federal data shows."
)
Most districts analyzed had a larger gender gap according to district records than federal data shows.
leaid year county lea_name gender_gap_district gender_gap_federal diff
2400030 2017-18 allegany Allegany County Public Schools 7.7 8.0 -0.3
2400060 2017-18 anne arundel Anne Arundel County Public Schools 5.9 -1.0 6.9
2400120 2017-18 baltimore Baltimore County Public Schools 4.8 -22.8 27.6
2400150 2017-18 calvert Calvert County Public Schools 5.8 -5.5 11.3
2400180 2017-18 caroline Caroline County Public Schools 7.7 3.8 3.9
2400210 2017-18 carroll Carroll County Public Schools 7.6 7.5 0.1
2400240 2017-18 cecil Cecil County Public Schools 3.6 -10.3 13.9
2400270 2017-18 charles Charles County Public Schools 4.4 7.8 -3.4
2400300 2017-18 dorchester Dorchester County Public Schools 5.7 4.5 1.2
2400330 2017-18 frederick Frederick County Public Schools 3.6 -6.2 9.8
2400360 2017-18 garrett Garrett County Public Schools 8.5 3.9 4.6
2400390 2017-18 harford Harford County Public Schools 4.5 3.8 0.7
2400420 2017-18 howard Howard County Public Schools 5.9 6.0 -0.1
2400450 2017-18 kent Kent County Public Schools 3.0 -6.2 9.2
2400480 2017-18 montgomery Montgomery County Public Schools 4.5 2.7 1.8
2400570 2017-18 somerset Somerset County Public Schools 9.6 15.5 -5.9
2400600 2017-18 st. mary’s St. Mary’s County Public Schools 3.6 2.9 0.7
2400660 2017-18 washington Washington County Public Schools 6.0 -4.0 10.0
2400690 2017-18 wicomico Wicomico County Public Schools 6.7 -2.0 8.7
2400720 2017-18 worcester Worcester County Public Schools 6.2 3.8 2.4
output_formatted_table(
  table = outliers,
  text = "Four districts did not follow this pattern: Somerset, Charles, Allegany and Howard."
)
Four districts did not follow this pattern: Somerset, Charles, Allegany and Howard.
leaid year county lea_name gender_gap_district gender_gap_federal diff
2400030 2017-18 allegany Allegany County Public Schools 7.7 8.0 -0.3
2400270 2017-18 charles Charles County Public Schools 4.4 7.8 -3.4
2400420 2017-18 howard Howard County Public Schools 5.9 6.0 -0.1
2400570 2017-18 somerset Somerset County Public Schools 9.6 15.5 -5.9

FACT: At 40% of districts, federal data indicates girls outnumber boys in sports, but district data tells a different story

“In fact, at 40% of districts, federal data indicates that when compared to their proportion of enrollment, girls outnumber boys in sports. But schools’ own data tells a different story: All districts have fewer opportunities for female athletes.”

Explanation

In my gender gap calculations, a negative number indicates that girls’ percentage of athletes is higher than their proportion of enrollment, meaning they are overrepresented in sports. When looking at federal data, that scenario occurs in 8 of 20 districts (40%). When looking at district data, it never occurs.

Supporting code and output

# Flag, for every district in the gender gap comparison, whether girls are
# overrepresented in sports relative to enrollment (a negative gender gap)
# according to district data and according to federal data
girls_outnumber <- compare_gender_gap %>%
  mutate(
    # The comparison itself already yields TRUE/FALSE, so no ifelse() wrapper is needed
    g_on_district = gender_gap_district < 0,
    g_on_federal = gender_gap_federal < 0
  )

# Percentage of districts where federal data suggests girls outnumber boys in sports
# (summing a logical column counts its TRUE values)
percent_overrep_fed <- sum(girls_outnumber$g_on_federal) / nrow(girls_outnumber) * 100

# Percentage of districts where district data suggests girls outnumber boys in sports
percent_overrep_district <- sum(girls_outnumber$g_on_district) / nrow(girls_outnumber) * 100

output_formatted_table(percent_overrep_fed, "Percent of districts where federal data suggests girls are overreprsented in sports")
Percent of districts where federal data suggests girls are overreprsented in sports
x
40
# Display the district-data percentage computed above, with the string as its caption
output_formatted_table(percent_overrep_district, "Percent of districts where district data suggests girls are overreprsented in sports")
Percent of districts where district data suggests girls are overreprsented in sports
x
0

FACT: For Baltimore County Public Schools, federal data suggests girls are overrepresented but district data says they are underrepresented

“In the large suburban district of Baltimore County, for example, federal data paints a picture of sports fields teeming with female athletes. The Office for Civil Rights’ public website says that, as of the 2017-18 school year, girls comprised 49% of enrollment in the district and 72% of athletes — making girls overrepresented in sports by 23 percentage points.

But the district’s in-house athletics data, which the Povich and Howard centers obtained through a public records request, says that girls actually comprised 44% of athletes, meaning they were underrepresented in sports by about five percentage points.”

Explanation

In my gender gap calculations, a negative number indicates that girls’ percentage of athletes is higher than their proportion of enrollment, meaning they are overrepresented in sports. A positive number for the gender gap indicates that girls are underrepresented in sports.

Supporting code and output

# Join the district proportionality table to the federal one on district ID,
# keep only Baltimore County, and put each source's gender gap and girls' share
# columns side by side (.x columns come from prop_districts, .y from prop_crdc)
baltimore_county_gap <- inner_join(prop_districts, prop_crdc, by = "leaid") %>%
  filter(county == "baltimore") %>%
  select(
    leaid,
    year,
    county,
    lea_name,
    gender_gap_district,
    gender_gap_federal,
    girls_sports_percent_district = girls_sports_percent.x,
    girls_sports_percent_federal = girls_sports_percent.y,
    girls_enroll_percent_federal = girls_enroll_percent.y
  )

output_formatted_table(baltimore_county_gap, "Baltimore County gender gap, district vs. federal, 2017-18")
Baltimore County gender gap, district vs. federal, 2017-18
leaid year county lea_name gender_gap_district gender_gap_federal girls_sports_percent_district girls_sports_percent_federal girls_enroll_percent_federal
2400120 2017-18 baltimore Baltimore County Public Schools 4.8 -22.8 43.7 71.5 48.7

FACT: In Calvert County, federal data submission excluded 616 boys and 54 girls

“In Calvert County, a district spokesperson said its 2017-18 federal data submission did not include football, golf, wrestling, baseball and tennis because those sports were coed. According to district data, that means 616 boys and 54 girls were not counted. Federal data suggests girls are overrepresented in sports by six percentage points, but district data shows they are underrepresented by the same amount.”

Explanation

I contacted school districts about the discrepancies I found between in-house athletics data and federal data. Some did not respond. Several district officials initially said they did not know where the federal data came from, although it is derived from a survey filled out by school officials. At every district where officials responded, I asked about a range of factors that might have caused a discrepancy, and I asked for other possible explanations. All of the factors I considered were ruled out except the single-sex definition upon which the federal data collection is built.

For this Calvert County example, I used district data that was disaggregated by sport. I summed the number of athletes in coed sports to show how many the district said were excluded from federal data. The resulting incompatible gender gaps were calculated in the proportionality and gender gap code above.

Supporting code and output

# Sum boys and girls across the sports that were considered coed to find the
# total number of athletes that were excluded from federal data
calvert_coed <- calvert_2017_18 %>%
  filter(sport %in% c("football", "baseball", "golf", "tennis", "wrestling")) %>%
  group_by(county, year) %>%
  summarise(
    boys_total_participation = sum(boys_participants),
    girls_total_participation = sum(girls_participants),
    # Return an ungrouped table; without this the result stays grouped by county
    # and summarise() prints a regrouping message
    .groups = "drop"
  )

# Show Calvert gender gap in federal and district data
calvert_gap <- compare_gender_gap %>%
  filter(county == "calvert")

output_formatted_table(calvert_coed, "Total athletes in Calvert County coed sports, 2017-18")
Total athletes in Calvert County coed sports, 2017-18
county year boys_total_participation girls_total_participation
Calvert 2017-18 616 54
# Display the Calvert County row of the gender gap comparison, with the string as its caption
output_formatted_table(calvert_gap, "Calvert County gender gap, district vs. federal, 2017-18")
Calvert County gender gap, district vs. federal, 2017-18
leaid year county lea_name gender_gap_district gender_gap_federal diff
2400150 2017-18 calvert Calvert County Public Schools 5.8 -5.5 11.3

FACT: In Caroline County, if football players were excluded, 15% of the district’s athletes would not have been counted.

“According to a 2020-21 data tipsheet for schools, when a girl participates on a predominantly male team, such as football or wrestling, the entire team should be excluded from the data. Since that can lead schools to exclude greater numbers of male athletes, it can result in the overall proportion of female athletes appearing higher than is the case. For instance, if Caroline County’s football players were omitted in the last data collection from 2017-18, it would mean that 135 boys — or 15% of the district’s athletes — were not counted in the federal data.”

Explanation

For this example, I used Caroline County participation data that was disaggregated by sport. I found the total number of athletes in each sport and calculated the percentage of total athletes that each sport represented.

Supporting code and output

# Total boys, girls and all athletes in Caroline County in 2017-18
boys_total <- sum(caroline_2017_18$boys)
girls_total <- sum(caroline_2017_18$girls)
athletes_total <- boys_total + girls_total

# Compute each sport's share of all athletes, rounded to a whole percent,
# then keep only the football row
caroline_sports_percent <- caroline_2017_18 %>%
  mutate(
    total_in_sport = boys + girls,
    percent = round(total_in_sport / athletes_total * 100)
  ) %>%
  filter(sport == "football")

output_formatted_table(caroline_sports_percent, "Caroline County football players comprised 15% of the district's athletes in 2017-18.")
Caroline County football players comprised 15% of the district’s athletes in 2017-18.
sport boys girls total_in_sport percent
football 135 0 135 15

GRAPHIC: Different Data, Different Opportunities

“At 16 of 20 Maryland school districts, federal civil rights data describes a more equitable playing field for female athletes than what districts’ own records show.”

Explanation

The data in this chart comes from the gender gap comparisons that were calculated above.

# Display the full district vs. federal gender gap comparison behind the graphic
output_formatted_table(compare_gender_gap, "Different Data, Different Opportunities")
Different Data, Different Opportunities
leaid year county lea_name gender_gap_district gender_gap_federal diff
2400030 2017-18 allegany Allegany County Public Schools 7.7 8.0 -0.3
2400060 2017-18 anne arundel Anne Arundel County Public Schools 5.9 -1.0 6.9
2400120 2017-18 baltimore Baltimore County Public Schools 4.8 -22.8 27.6
2400150 2017-18 calvert Calvert County Public Schools 5.8 -5.5 11.3
2400180 2017-18 caroline Caroline County Public Schools 7.7 3.8 3.9
2400210 2017-18 carroll Carroll County Public Schools 7.6 7.5 0.1
2400240 2017-18 cecil Cecil County Public Schools 3.6 -10.3 13.9
2400270 2017-18 charles Charles County Public Schools 4.4 7.8 -3.4
2400300 2017-18 dorchester Dorchester County Public Schools 5.7 4.5 1.2
2400330 2017-18 frederick Frederick County Public Schools 3.6 -6.2 9.8
2400360 2017-18 garrett Garrett County Public Schools 8.5 3.9 4.6
2400390 2017-18 harford Harford County Public Schools 4.5 3.8 0.7
2400420 2017-18 howard Howard County Public Schools 5.9 6.0 -0.1
2400450 2017-18 kent Kent County Public Schools 3.0 -6.2 9.2
2400480 2017-18 montgomery Montgomery County Public Schools 4.5 2.7 1.8
2400570 2017-18 somerset Somerset County Public Schools 9.6 15.5 -5.9
2400600 2017-18 st. mary’s St. Mary’s County Public Schools 3.6 2.9 0.7
2400660 2017-18 washington Washington County Public Schools 6.0 -4.0 10.0
2400690 2017-18 wicomico Wicomico County Public Schools 6.7 -2.0 8.7
2400720 2017-18 worcester Worcester County Public Schools 6.2 3.8 2.4

GRAPHIC: Gender Gaps in Maryland High School Sports

“50 years after the passage of Title IX, the playing field remains unequal for female athletes”

Explanation

The data in this chart comes from the calculations of what proportion of enrollment and athletics each gender represents in district records.

# Keep only the columns the graphic uses from the district proportionality
# table: each gender's share of enrollment and of athletes, plus the gender gap
md_graphic <- select(
  prop_districts,
  county,
  year,
  boys_enroll_percent,
  girls_enroll_percent,
  boys_sports_percent,
  girls_sports_percent,
  gender_gap_district
)

output_formatted_table(md_graphic, "Gender Gaps in Maryland High School Sports")
Gender Gaps in Maryland High School Sports
county year boys_enroll_percent girls_enroll_percent boys_sports_percent girls_sports_percent gender_gap_district
allegany 2017-18 52.9 47.1 60.6 39.4 7.7
anne arundel 2017-18 50.4 49.6 56.3 43.7 5.9
baltimore 2017-18 51.5 48.5 56.3 43.7 4.8
calvert 2017-18 49.4 50.6 55.2 44.8 5.8
caroline 2017-18 50.8 49.2 58.5 41.5 7.7
carroll 2017-18 50.8 49.2 58.4 41.6 7.6
cecil 2017-18 52.0 48.0 55.6 44.4 3.6
charles 2017-18 52.3 47.7 56.7 43.3 4.4
dorchester 2017-18 50.1 49.9 55.8 44.2 5.7
frederick 2017-18 52.0 48.0 55.6 44.4 3.6
garrett 2017-18 52.0 48.0 60.5 39.5 8.5
harford 2017-18 51.4 48.6 55.9 44.1 4.5
howard 2017-18 51.1 48.9 57.0 43.0 5.9
kent 2017-18 53.3 46.7 56.3 43.7 3.0
montgomery 2017-18 51.5 48.5 56.0 44.0 4.5
somerset 2017-18 48.5 51.5 58.1 41.9 9.6
st. mary’s 2017-18 50.7 49.3 54.3 45.7 3.6
washington 2017-18 50.8 49.2 56.8 43.2 6.0
wicomico 2017-18 51.6 48.4 58.3 41.7 6.7
worcester 2017-18 52.1 47.9 58.3 41.7 6.2

-30-