Deepak Bastola
  • Research Projects
  • Courses
  • CV
  • Packages

On this page

  • Introduction
  • Setup
  • 1. Scrape the Data
  • 2. Clean & Tidy
  • 3. Interactive Map
  • 3. Interactive Data Tables
  • 4. Interactive Top-10 Table
  • 5. Publication-Ready Plots
    • 5.1 Top 10 States by Avg Cases
    • 5.2 Cases vs Hospitalizations per 100 k
    • Another Real-world Example

Scraping & Visualizing COVID-19 Data with tabela

Introduction

The tabela package provides a streamlined interface for scraping and parsing HTML tables and downloadable files directly from web pages. Educators and students can leverage tabela to:

  • Data-driven Teaching Materials: Rapidly prototype datasets for classroom exercises by extracting tables from web sources.
  • Real-time Reporting: Integrate up-to-date data into interactive reports and dashboards for demonstrations and assignments.
  • Automated Data Ingestion: Streamline workflows by programmatically collecting tabular data from most educational websites.

Setup

# Install everything used in this post (one-time):
# install.packages(c("tabela", "dplyr", "ggplot2", "ggpubr", "ggthemes",
#                    "readr", "stringr", "gt", "ggmap", "maps", "plotly",
#                    "viridis", "xml2", "purrr", "rvest"))
library(tabela)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggthemes)
library(readr)
library(stringr)
library(gt)
library(ggmap)
library(maps)     # map_data() county polygons
library(plotly)   # ggplotly() interactivity
library(viridis)  # for a nice color scale
library(xml2)
library(purrr) 
library(rvest)

1. Scrape the Data

# Target page: USAFacts COVID-19 spread map (HTML tables + CSV downloads)
url <- "https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/"

# Start a polite session
# NOTE(review): init_session() is presumably a rate-limited wrapper around
# an rvest session object -- confirm against the tabela documentation
session <- init_session(url)
print(session$url)    
[1] "https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/"
# Pull out every hyperlink on the page
all_links <- get_all_links(session)
length(all_links)   
[1] 103
# Peek at the last few links found (footer/social links, as expected)
all_links |> 
  tail() |>
  knitr::kable()
x
https://www.threads.net/@usafacts
https://www.tiktok.com/@usafacts.org?lang=en
https://www.youtube.com/channel/UChwTFs3rYtFi-6ujmKtv-9Q
https://www.reddit.com/user/USAFacts/
https://usafacts.org/terms-and-conditions/
https://usafacts.org/privacy-policy/
# scrape only HTML tables; exts = character() downloads no files,
# hence the "0 of 0 files" message below
res <- scrape_page_data(
  root_url    = url,
  exts        = character(),   # skip file links
  list_tables = TRUE
)
Scraped page 1 (1–0 of 0 files)
  → Found 2 HTML tables on this page
# Filter down to, say, only CSV or TXT files (the raw USAFacts data)
file_links <- filter_links_by_ext(all_links, exts = c("csv","txt"))
file_links  
[1] "https://static.usafacts.org/public/data/covid-19/covid_confirmed_usafacts.csv"        
[2] "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv"           
[3] "https://static.usafacts.org/public/data/covid-19/covid_county_population_usafacts.csv"

2. Clean & Tidy

# 1. Prep the data: file_links[3] is the county population CSV
county_population <- read_csv(file_links[3])

# Inspect. The countyFIPS == 0 rows are "Statewide Unallocated"
# placeholders with population 0 (visible in the glimpse below).
glimpse(county_population)
Rows: 3,195
Columns: 4
$ countyFIPS    <dbl> 0, 1001, 1003, 1005, 1007, 1009, 1011, 1013, 1015, 1017,…
$ `County Name` <chr> "Statewide Unallocated", "Autauga County", "Baldwin Coun…
$ State         <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "A…
$ population    <dbl> 0, 55869, 223234, 24686, 22394, 57826, 10101, 19448, 113…
# Restrict to Maryland and build a join key matching the `subregion`
# names used by maps::map_data("county"), which are lower-case with
# punctuation stripped (e.g. "st marys", "prince georges").
# Removing only " County" would leave "st. mary's", "prince george's",
# "queen anne's", which never match and silently join to NA population.
md_pop <- county_population %>%
  filter(State == "MD") %>%
  mutate(
    subregion = `County Name` %>%
      tolower() %>%
      str_remove(" county$") %>%
      str_remove_all("[.']")
  )

# 2. Get Maryland county boundaries (one row per polygon vertex)
md_counties <- map_data("county") |>
  filter(region == "maryland")

3. Interactive Map

# 3. Bounding box for Maryland, taken from the polygon vertex extremes
lon_range <- range(md_counties$long)
lat_range <- range(md_counties$lat)
bbox <- c(
  left   = lon_range[1],
  bottom = lat_range[1],
  right  = lon_range[2],
  top    = lat_range[2]
)

# 4. Fetch a Stamen terrain-background basemap via Stadia Maps
stadia_map <- get_stadiamap(
  bbox    = bbox,
  zoom    = 7,
  maptype = "stamen_terrain_background",
  crop    = TRUE
)

# 5. Overlay county polygons and make interactive
# 5. Overlay county polygons on the basemap and make it interactive.
#    `text` is not a real ggplot2 aesthetic -- it is smuggled through so
#    ggplotly() can use it as the hover tooltip (ggplot2 will warn).
p <- ggmap(stadia_map) +
  geom_polygon(
    data = md_counties |> left_join(md_pop, by = "subregion"),
    aes(
      x    = long,
      y    = lat,
      group= group,
      fill = population,
      text = paste0(str_to_title(subregion), ": ", format(population, big.mark = ","))
    ),
    # linewidth replaces the deprecated `size` for line widths (ggplot2 >= 3.4)
    color = "white", linewidth = 0.2, alpha = 0.8
  ) +
  scale_fill_viridis_c(option = "magma", na.value = "grey90") +
  theme_void() +
  labs(
    title = "Maryland County Populations (USAFacts)",
    fill  = "Population"
  )

ggplotly(p, tooltip = "text")

3. Interactive Data Tables

# Extract the second scraped table (the per-state COVID figures).
# NOTE(review): the [[2]] index depends on the page's current layout --
# verify if USAFacts restructures the page.
covid_tbl <- res$tables[[2]]
knitr::kable(head(covid_tbl))
State 7-day avg. cases 7-day avg. deaths Cases Deaths 7-day avg. hospitalizations 7-day avg. hospitalizations per 100k
AL 0 0 1,659,936 21,138 30 0.6
AK 0 0 287,319 1,457 4 0.6
AZ 0 0 2,486,671 29,852 152 2.0
AR 0 0 977,662 13,062 30 1.0
CA 128 0 11,300,486 102,356 377 1.0
CO 0 0 1,769,981 14,522 92 1.0

# Convert the scraped character columns to numeric types.
# Count columns carry thousands separators ("1,659,936") that must be
# stripped before integer conversion.
strip_commas <- function(x) as.integer(gsub(",", "", x, fixed = TRUE))

covid_clean <- covid_tbl |>
  mutate(
    across(c(`7-day avg. cases`, `7-day avg. deaths`, Cases, Deaths), strip_commas),
    `7-day avg. hospitalizations`          = as.integer(`7-day avg. hospitalizations`),
    `7-day avg. hospitalizations per 100k` = as.numeric(`7-day avg. hospitalizations per 100k`)
  )

4. Interactive Top-10 Table

# Ten states with the highest 7-day average case counts, rendered as a
# formatted gt table (labels relabelled, counts to 0 dp, rate to 1 dp).
top10 <- covid_clean |>
  arrange(desc(`7-day avg. cases`)) |>
  slice_head(n = 10)

top10 |>
  gt() |>
  tab_header(
    title = md("**Top 10 States by 7-Day Average COVID-19 Cases**")
  ) |>
  cols_label(
    `7-day avg. cases`                     = "Avg Cases",
    `7-day avg. deaths`                    = "Avg Deaths",
    `7-day avg. hospitalizations per 100k` = "Hosp/100k"
  ) |>
  fmt_number(
    columns  = c(`7-day avg. cases`, `7-day avg. deaths`, Cases, Deaths),
    decimals = 0
  ) |>
  fmt_number(
    columns  = "7-day avg. hospitalizations per 100k",
    decimals = 1
  ) |>
  tab_options(table.font.size = px(14))
Top 10 States by 7-Day Average COVID-19 Cases
State Avg Cases Avg Deaths Cases Deaths 7-day avg. hospitalizations Hosp/100k
NY 429 −37 6,706,390 77,423 177 0.9
MA 268 0 2,048,722 21,035 52 0.8
VA 210 0 2,323,255 23,769 204 2.0
WA 184 2 1,969,833 15,972 34 0.4
MI 157 3 3,119,532 43,191 67 0.7
CA 128 0 11,300,486 102,356 377 1.0
HI 96 0 393,757 1,955 27 2.0
NV 74 0 892,252 12,084 26 0.8
WI 50 0 2,036,872 16,723 79 1.0
ME 18 1 324,378 3,085 25 1.0

5. Publication-Ready Plots

5.1 Top 10 States by Avg Cases

# 5.1 Bar chart of the ten states with the highest 7-day average cases.
# slice_max() supersedes top_n() (dplyr >= 1.0); ties are kept in both.
p1 <- covid_clean |>
  slice_max(`7-day avg. cases`, n = 10) |>
  ggplot(aes(reorder(State, `7-day avg. cases`), `7-day avg. cases`)) +
    geom_col(fill = "steelblue") +
    coord_flip() +   # horizontal bars so state labels stay readable
    labs(
      title = "Top 10 States by 7-Day Average COVID-19 Cases",
      x     = "State",
      y     = "7-Day Average Cases"
    ) +
    theme_pubr() +
    theme(
      plot.title = element_text(face = "bold", size = 16),
      axis.title = element_text(size = 12)
    )

print(p1)

5.2 Cases vs Hospitalizations per 100 k

# 5.2 Scatter of 7-day average cases vs hospitalizations per 100k
# with a dashed linear trend line. Rows without a hospitalization rate
# are dropped up front rather than silently removed by ggplot.
p2 <- covid_clean |>
  filter(!is.na(`7-day avg. hospitalizations per 100k`)) |>
  ggplot(aes(`7-day avg. cases`, `7-day avg. hospitalizations per 100k`)) +
    geom_point(size = 3, alpha = 0.7) +
    # formula stated explicitly to silence geom_smooth()'s
    # "using formula = 'y ~ x'" render message; the fit is unchanged
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, linetype = "dashed") +
    labs(
      title = "7-Day Avg Cases vs Hospitalizations per 100 k",
      x     = "7-Day Average Cases",
      y     = "Hospitalizations per 100 k"
    ) +
    theme_few() +
    theme(
      plot.title = element_text(face = "bold", size = 16)
    )

print(p2)

Another Real-world Example

Below is an example relevant to students: scraping the QS World University Rankings 2025 from Wikipedia using tabela.

# Another influential URL: QS World University Rankings 2025
url2 <- "https://en.wikipedia.org/wiki/QS_World_University_Rankings"

# Start a polite session
session2 <- init_session(url2)
print(session2$url)  
[1] "https://en.wikipedia.org/wiki/QS_World_University_Rankings"
# Pull out every hyperlink on the article
all_links2 <- get_all_links(session2)
length(all_links2)  
[1] 793
# Show 20 random links. NOTE(review): sample() is unseeded, so this
# output is not reproducible across renders -- consider set.seed().
all_links2 |> 
  sample(20) |>
  knitr::kable()
x
https://en.wikipedia.org/w/index.php?title=QS_World_University_Rankings&action=history
https://en.wikipedia.org/wiki/QS_World_University_Rankings#cite_note-38
https://en.wikipedia.org/wiki/QS_World_University_Rankings#cite_note-28
https://en.wikipedia.org/wiki/Imperial_College_London
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
https://en.wikipedia.org/wiki/U.S._News_&_World_Report_Best_Global_University_Ranking
https://en.wikipedia.org/wiki/QS_World_University_Rankings
https://en.wikipedia.org/wiki/QS_World_University_Rankings#cite_note-autogenerated1-21
https://en.wikipedia.org/wiki/Help%3ACategory
https://en.wikipedia.org/wiki/Hong_Kong
https://en.wikipedia.org/wiki/QS_World_University_Rankings#cite_note-15
https://en.wikipedia.org/wiki/QS_World_University_Rankings#cite_note-18
http://www.universityworldnews.com/article.php?story=2016062113365585
https://en.wikipedia.org/wiki/S2CID_(identifier)
https://en.wikipedia.org/wiki/QS_World_University_Rankings#cite_note-37
https://fr.wikipedia.org/wiki/Classement_mondial_des_universit%C3%A9s_QS
https://en.wikipedia.org/wiki/Australia
https://en.wikipedia.org/wiki/Universidad_de_Chile
https://en.wikipedia.org/wiki/Harvard_University
https://doi.org/10.5281%2Fzenodo.2592196
# Scrape only the HTML tables from the article (no file downloads)
res2 <- scrape_page_data(
  root_url    = url2,
  exts        = character(),
  list_tables = TRUE
)
Scraped page 1 (1–0 of 0 files)
  → Found 10 HTML tables on this page
# Table [[2]] is the top-25 ranking-by-year table shown below.
# NOTE(review): this index depends on the article's current layout --
# re-verify if Wikipedia restructures the page.
uni_tbl <- res2$tables[[2]]
uni_tbl |>
  knitr::kable()
Institution 2025 2024 2023 2022 2021 2020 2019 2018
MIT 1 1 1 1 1 1 1 1
Imperial College London 2 6 6 7 8 9 8 8
University of Oxford 3 3 4 2 5 4 5 6
Harvard University 4 4 5 5 3 3 3 3
University of Cambridge 5 2 2 3 7 7 6 5
Stanford University 6 5 3 3 2 2 2 2
ETH Zurich 7 7 9 8 6 6 7 10
National University of Singapore 8 8 11 11 11 11 11 15
University College London 9 9 8 8 10 8 10 7
California Institute of Technology 10 15 6 6 4 5 4 4
University of Pennsylvania 11 12 13 13 16 15 19 19
University of California, Berkeley 12 10 27 32 30 28 27 27
University of Melbourne 13 14 33 37 41 38 39 41
Peking University 14 17 12 18 23 22 30 38
Nanyang Technological University 15 26 19 12 13 11 12 11
Cornell University 16 13 20 21 18 14 14 14
University of Hong Kong 17 27 21 22 22 26 25 26
University of Sydney 18 20 41 38 40 42 42 50
University of New South Wales 19 19 45 43 44 43 45 45
Tsinghua University 20 25 14 17 15 16 17 25
University of Chicago 21 11 10 10 9 10 9 9
Princeton University 22 18 17 20 12 13 13 13
Yale University 23 16 18 15 17 17 15 16
Université PSL 24 24 26 44 52 53 50 NA
University of Toronto 25 21 35 26 26 30 28 31
# Build an English-Wikipedia URL for each institution from its name.
inst_names    <- uni_tbl$Institution
inst_patterns <- inst_names %>% 
  # slugify: strip punctuation, collapse spaces to underscores, e.g.
  # "University of California, Berkeley" -> "University_of_California_Berkeley"
  str_remove_all("[^[:alnum:] ]") %>% 
  str_replace_all(" +", "_")

# 2. Build absolute Wiki URLs. xml2::url_absolute() is vectorised, so no
#    per-element map_chr() is needed; both mutate steps are merged into one.
links_df <- tibble(name = inst_names) %>%
  mutate(
    subpath = inst_patterns,
    url     = url_absolute(subpath, "https://en.wikipedia.org/wiki/"),
    # Manual override: the generated slug "Université_PSL" does not
    # resolve on English Wikipedia, so point at the article's real title.
    url = if_else(
      name == "Université PSL",
      "https://en.wikipedia.org/wiki/Paris_Sciences_et_Lettres_University",
      url
    )
  )
# Scrape the decimal latitude/longitude from one institution's Wikipedia page.
#
# The page's first ".latitude"/".longitude" elements hold coordinate text,
# usually degrees-minutes-seconds. The original parser indexed parts[2] and
# parts[3] unconditionally, so pages that publish degree-minute or decimal
# coordinates produced NA -- the reason manual overrides exist downstream.
# Returns a one-row tibble (name, lat, lon); lat/lon are NA when the page
# has no parseable coordinate element.
get_coords <- function(name, url) {
  page    <- read_html(url)
  lat_dms <- page %>% html_element(".latitude")  %>% html_text2()
  lon_dms <- page %>% html_element(".longitude") %>% html_text2()

  # Convert a coordinate string to decimal degrees, accepting 1-3 numeric
  # parts (D, DM, or DMS); extra trailing numbers are ignored.
  dms2dec <- function(dms) {
    if (is.na(dms)) return(NA_real_)   # no coordinate element on the page
    parts <- as.numeric(str_extract_all(dms, "[0-9]+\\.?[0-9]*")[[1]])
    if (length(parts) == 0) return(NA_real_)
    parts <- parts[seq_len(min(length(parts), 3L))]
    dec   <- sum(parts / c(1, 60, 3600)[seq_along(parts)])
    dir   <- str_extract(dms, "[NSEW]")
    if (!is.na(dir) && dir %in% c("S", "W")) dec <- -dec
    dec
  }

  tibble(
    name = name,
    lat  = dms2dec(lat_dms),
    lon  = dms2dec(lon_dms)
  )
}


# Scrape coordinates for every institution, one row per page
coords_df <- map2_dfr(links_df$name, links_df$url, get_coords)

# Attach the scraped coordinates to the link table by institution name
final_df <- left_join(links_df, coords_df, by = "name")

# Manually patch coordinates the scraper got wrong or missed.
# NOTE(review): presumably these three articles lack a parseable
# ".latitude"/".longitude" element or carry off-campus coordinates --
# confirm against the live pages before removing these overrides.
final_df <- final_df %>%
  mutate(
    lat = case_when(
      name == "Imperial College London"           ~  51.498356,
      name == "Nanyang Technological University"  ~   1.3483099,
      name == "University of New South Wales"     ~ -33.917300,
      TRUE                                         ~ lat
    ),
    lon = case_when(
      name == "Imperial College London"           ~  -0.176894,
      name == "Nanyang Technological University"  ~ 103.6831347,
      name == "University of New South Wales"     ~ 151.225300,
      TRUE                                         ~ lon
    )
  )

# Final lookup table: name, URL slug, and decimal coordinates
final_df %>% knitr::kable()
name subpath url lat lon
MIT MIT https://en.wikipedia.org/wiki/MIT 42.359722 -71.0919444
Imperial College London Imperial_College_London https://en.wikipedia.org/wiki/Imperial_College_London 51.498356 -0.1768940
University of Oxford University_of_Oxford https://en.wikipedia.org/wiki/University_of_Oxford 51.755000 -1.2550000
Harvard University Harvard_University https://en.wikipedia.org/wiki/Harvard_University 42.374444 -71.1169444
University of Cambridge University_of_Cambridge https://en.wikipedia.org/wiki/University_of_Cambridge 52.205278 0.1172222
Stanford University Stanford_University https://en.wikipedia.org/wiki/Stanford_University 37.427500 -122.1700000
ETH Zurich ETH_Zurich https://en.wikipedia.org/wiki/ETH_Zurich 47.376389 8.5480556
National University of Singapore National_University_of_Singapore https://en.wikipedia.org/wiki/National_University_of_Singapore 1.295556 103.7766667
University College London University_College_London https://en.wikipedia.org/wiki/University_College_London 51.524722 -0.1336111
California Institute of Technology California_Institute_of_Technology https://en.wikipedia.org/wiki/California_Institute_of_Technology 34.137500 -118.1250000
University of Pennsylvania University_of_Pennsylvania https://en.wikipedia.org/wiki/University_of_Pennsylvania 39.950278 -75.1947222
University of California, Berkeley University_of_California_Berkeley https://en.wikipedia.org/wiki/University_of_California_Berkeley 37.871944 -122.2583333
University of Melbourne University_of_Melbourne https://en.wikipedia.org/wiki/University_of_Melbourne -37.796389 144.9613889
Peking University Peking_University https://en.wikipedia.org/wiki/Peking_University 39.989722 116.3052778
Nanyang Technological University Nanyang_Technological_University https://en.wikipedia.org/wiki/Nanyang_Technological_University 1.348310 103.6831347
Cornell University Cornell_University https://en.wikipedia.org/wiki/Cornell_University 42.449167 -76.4838889
University of Hong Kong University_of_Hong_Kong https://en.wikipedia.org/wiki/University_of_Hong_Kong 22.284167 114.1377778
University of Sydney University_of_Sydney https://en.wikipedia.org/wiki/University_of_Sydney -33.887500 151.1900000
University of New South Wales University_of_New_South_Wales https://en.wikipedia.org/wiki/University_of_New_South_Wales -33.917300 151.2253000
Tsinghua University Tsinghua_University https://en.wikipedia.org/wiki/Tsinghua_University 40.000000 116.3266667
University of Chicago University_of_Chicago https://en.wikipedia.org/wiki/University_of_Chicago 41.789722 -87.5997222
Princeton University Princeton_University https://en.wikipedia.org/wiki/Princeton_University 40.345278 -74.6561111
Yale University Yale_University https://en.wikipedia.org/wiki/Yale_University 41.316389 -72.9222222
Université PSL Université_PSL https://en.wikipedia.org/wiki/Paris_Sciences_et_Lettres_University 48.854667 2.3378056
University of Toronto University_of_Toronto https://en.wikipedia.org/wiki/University_of_Toronto 43.661667 -79.3950000
library(ggmap)  # already attached in Setup; harmless repeat
# World basemap centered near (0, 20) for the density plot below.
# NOTE(review): source = "google" requires a Google Maps API key
# registered via ggmap::register_google(); without one this call fails.
# Confirm a key is configured, or switch to a Stadia/OSM source.
world_map <- get_map(location = c(0,20), zoom = 1, source = "google")

library(leaflet)
library(leaflet.extras2)

# Interactive world map of the QS top-25 institutions: an Esri topo
# basemap, opaque circle markers, and always-visible text labels.
# Markers and labels live in separate groups so each can be toggled
# from the layers control.
leaflet(final_df) |>
  setView(lng = 0, lat = 20, zoom = 2) |>
  addProviderTiles(providers$Esri.WorldTopoMap, group = "Base") |>
  addCircleMarkers(
    lng         = ~lon,
    lat         = ~lat,
    radius      = 6,
    weight      = 2,
    color       = "#FFFFFF",
    fillColor   = "#2A9DF4",
    opacity     = 1,
    fillOpacity = 0.9,
    group       = "Markers"
  ) |>
  addLabelOnlyMarkers(
    lng   = ~lon,
    lat   = ~lat,
    label = ~name,
    group = "Labels",
    labelOptions = labelOptions(
      noHide    = TRUE,
      textOnly  = TRUE,
      direction = "auto",
      opacity   = 1,
      style = list(
        "color"       = "black",
        "font-weight" = "bold",
        "font-size"   = "6px"
      )
    )
  ) |>
  addLayersControl(
    baseGroups    = "Base",
    overlayGroups = c("Markers", "Labels"),
    options       = layersControlOptions(collapsed = FALSE)
  )
# 3. Heat map: 2D kernel density of university locations over the basemap.
#    Uses final_df so the manually corrected coordinates are included
#    (coords_df still carries the scraper's raw, partly-NA values).
p_heat_aesthetic <- ggmap(world_map) +
  stat_density2d(
    data    = final_df,
    # after_stat() replaces the deprecated ..level.. notation (ggplot2 >= 3.4)
    aes(x = lon, y = lat, fill = after_stat(level), alpha = after_stat(level)),
    geom    = "polygon",
    contour = TRUE
  ) +
  scale_fill_viridis_c(option = "magma") +
  # "none" replaces the deprecated logical FALSE for hiding a guide
  guides(alpha = "none") +
  theme_map(base_size = 12) +
  theme(
    plot.title      = element_text(face = "bold", size = 16),
    legend.position = "none"
  ) +
  labs(
    title = "Heat Map of QS 2025 University Density",
    fill  = "Density Level"
  )
p_heat_aesthetic

© 2023 Deepak Bastola

 

View source on GitHub