Portland Asthma Study
Asthma has multiple risk factors, including social, environmental, and financial ones [1]. Portland, Oregon is an urban area that takes pride in protecting people from environmental exposure to toxic chemicals from fuel exhaust, among other sources. It has 92,000 acres of green space and "...the highest rate of biking to work of any major US city" [2]. This study models rates of adult asthma in proximity to urban green spaces, using CDC asthma data [3] merged with a Normalized Difference Vegetation Index (NDVI) to examine the geographic distribution of landscape metrics such as mean patch size, edge density, and fragmentation. A study of an area in southern India undergoing rapid urbanization found "...positive relationship[s] between tree cover and various indicators of wellbeing..." at both household and neighborhood levels [4]. In Portland, the potential tree canopy is unevenly distributed, "...with areas east of the Willamette River containing 82% of the city's potential tree canopy, compared to 18% west of the river" [5]. There is also evidence that fine particulate matter near airports contributes to respiratory problems [6]. I expect the model to show geographic patterns with higher rates of asthma near the airport and in other areas with low potential for planting trees.
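NDVI is computed from the red and near-infrared reflectance bands; the formula below is the same band arithmetic applied later in this notebook, where values above 0.3 are treated as vegetation:

$$\mathrm{NDVI} = \frac{\mathrm{NIR} - \mathrm{Red}}{\mathrm{NIR} + \mathrm{Red}}, \qquad -1 \le \mathrm{NDVI} \le 1$$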
Sources
1. Malik HU, Kumar K, Frieri M. Minimal difference in the prevalence of asthma in the urban and rural environment. Clin Med Insights Pediatr. 2012 Jun 19;6:33-9. https://journals.sagepub.com/doi/10.4137/CMPed.S9539
2. Green City Times. Available online at: https://www.greencitytimes.com/portland/
3. CDC PLACES Data Portal.
4. Thapa P, Torralba M, Nölke N, et al. Disentangling associations of human wellbeing with green infrastructure, degree of urbanity, and social factors around an Asian megacity. Landsc Ecol 39, 152 (2024). https://doi.org/10.1007/s10980-024-01937-6
5. Ramsey J, DiSalvo A. Tree Canopy and Potential in Portland, Oregon. A report by Portland Parks & Recreation Urban Forestry. February 2018.
6. Johnson K, Solet D, Serry K. Community Health and Airport Operations Related Noise and Air Pollution: Report to the Legislature in Response to Washington State House Bill 1109. December 1, 2020. Public Health Seattle & King County; Assessment, Policy Development and Evaluation Unit.
In [ ]:
# Import packages for modeling and statistical analysis of spatial data
# This cell imports all the Python libraries the project needs.
import os  # operating system utilities (paths, environment variables)
import pathlib  # object-oriented file and directory paths
import time  # pause between retries when a download fails
import warnings  # control how warnings are displayed

import geopandas as gpd  # read and manipulate geospatial dataframes
import geoviews as gv  # geospatial visualization
import holoviews as hv  # visualization
import hvplot.pandas  # plot pandas dataframes
import hvplot.xarray  # plot xarray data arrays
import matplotlib.pyplot as plt  # static plotting
import numpy as np  # numerical computation
import pandas as pd  # tabular dataframes
import pystac_client  # search SpatioTemporal Asset Catalogs (STAC)
import requests  # HTTP requests
import rioxarray as rxr  # xarray with geospatial raster support
import rioxarray.merge as rxrmerge  # merge multiple raster datasets
import scipy.stats as stats  # statistical tests
import shapely  # analyze geometric objects
import statsmodels.api as sm  # statistical modeling
import xarray as xr  # labeled multi-dimensional arrays
from cartopy import crs as ccrs  # map projections
from holoviews import opts  # HoloViews options
from holoviews.element import tiles  # background map tiles
from scipy.ndimage import convolve  # convolution, used here for edge detection
from scipy.ndimage import label  # connected-component labeling of vegetation patches
from sklearn.decomposition import PCA  # dimensionality reduction
from sklearn.linear_model import LinearRegression  # linear regression model
from sklearn.model_selection import KFold, train_test_split  # cross-validation and data splitting
from sklearn.preprocessing import StandardScaler  # feature scaling
from tqdm.notebook import tqdm  # progress bars in notebooks

# Define the directory where data will be stored
data_dir = os.path.join(
    pathlib.Path.home(),
    'Documents',
    'eaclassprojects',
    'bigdata',
    'portland'
)
os.makedirs(data_dir, exist_ok=True)  # create the directory if it doesn't exist
warnings.simplefilter('ignore')  # suppress warnings

# Prevent GDAL from quitting due to momentary network disruptions
os.environ["GDAL_HTTP_MAX_RETRY"] = "5"  # maximum number of retries
os.environ["GDAL_HTTP_RETRY_DELAY"] = "1"  # delay between retries, in seconds
In [ ]:
# Set up the census tract path
tract_dir = os.path.join(data_dir, 'oregon-tract')  # create a subdirectory
os.makedirs(tract_dir, exist_ok=True)  # prevents an error if the directory already exists
tract_path = os.path.join(tract_dir, 'oregon-tract.shp')  # full path to the shapefile

# Download the census tracts (only once)
if not os.path.exists(tract_path):
    tract_url = 'https://data.cdc.gov/download/x7zy-2xmx/application%2Fzip'
    tract_gdf = gpd.read_file(tract_url)
    po_tract_gdf = tract_gdf[tract_gdf.PlaceName == 'Portland']
    po_tract_gdf.to_file(tract_path, index=False)

# Load in the census tract data
po_tract_gdf = gpd.read_file(tract_path)

# Site plot -- census tracts with satellite imagery in the background
(
    po_tract_gdf
    .to_crs(ccrs.Mercator())
    .hvplot(
        line_color='orange', fill_color=None,
        crs=ccrs.Mercator(), tiles='EsriImagery',
        frame_width=600)
)
po_tract_gdf.head()
# Note: this geodataframe was created before the online data was corrected,
# so the location error (Portland, Maine tracts; see ST == 23 below) remains
Out[ ]:
| | place2010 | tract2010 | ST | PlaceName | plctract10 | PlcTrPop10 | geometry |
|---|---|---|---|---|---|---|---|
| 0 | 2360545 | 23005000100 | 23 | Portland | 2360545-23005000100 | 2346 | MULTIPOLYGON (((-7819264.773 5414638.107, -781... |
| 1 | 2360545 | 23005000200 | 23 | Portland | 2360545-23005000200 | 2349 | POLYGON ((-7819262.929 5413477.756, -7819269.5... |
| 2 | 2360545 | 23005000300 | 23 | Portland | 2360545-23005000300 | 2625 | POLYGON ((-7819262.929 5413477.756, -7819348.6... |
| 3 | 2360545 | 23005000500 | 23 | Portland | 2360545-23005000500 | 2420 | MULTIPOLYGON (((-7820627.306 5414284.438, -782... |
| 4 | 2360545 | 23005000600 | 23 | Portland | 2360545-23005000600 | 3389 | POLYGON ((-7820810.983 5413214.16, -7820836.03... |
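The ST values above are 23, the state FIPS code for Maine, which confirms the location error: filtering on `PlaceName == 'Portland'` alone also matches Portland, Maine. A minimal sketch of an extra state filter (assuming the `ST` column holds state FIPS codes):

```python
# Keep only Portland, Oregon tracts (Oregon's state FIPS code is 41).
# Cast ST to string in case it is stored numerically.
po_tract_gdf = tract_gdf[
    (tract_gdf.PlaceName == 'Portland') & (tract_gdf.ST.astype(str) == '41')
]
```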
In [ ]:
# Set up a path for asthma data
cdc_path = os.path.join(data_dir, 'asthma.csv')
# Download asthma data (only once)
if not os.path.exists(cdc_path):
    cdc_url = (
        "https://data.cdc.gov/resource/cwsq-ngmh.csv"  # CDC PLACES API endpoint
        "?year=2022"  # filter for data from 2022
        "&stateabbr=OR"  # filter for Oregon
        "&countyname=Portland"  # filter for Portland
        "&measureid=CASTHMA"  # filter for asthma prevalence data
        "&$limit=1500"  # limit the number of returned records
    )
    cdc_df = (
        pd.read_csv(cdc_url)  # read the data from the URL into a pandas DataFrame
        .rename(columns={  # rename columns for clarity
            'data_value': 'asthma',
            'low_confidence_limit': 'asthma_ci_low',
            'high_confidence_limit': 'asthma_ci_high',
            'locationname': 'tract'})
        [[  # select and order the desired columns
            'year', 'tract',
            'asthma', 'asthma_ci_low', 'asthma_ci_high', 'data_value_unit',
            'totalpopulation', 'totalpop18plus', 'geolocation'
        ]]
    )
    cdc_df.to_csv(cdc_path, index=False)  # save the DataFrame to a CSV file

# Load asthma data
cdc_df = pd.read_csv(cdc_path)

# Preview asthma data
cdc_df
Out[ ]:
| | year | tract | asthma | asthma_ci_low | asthma_ci_high | data_value_unit | totalpopulation | totalpop18plus |
|---|---|---|---|---|---|---|---|---|
| 0 | 2022 | 41051000202 | 11.4 | 10.1 | 12.8 | % | 3456 | 2875 |
| 1 | 2022 | 41051000702 | 10.8 | 9.6 | 12.1 | % | 5243 | 4268 |
| 2 | 2022 | 41051000602 | 11.5 | 10.2 | 12.9 | % | 5756 | 4612 |
| 3 | 2022 | 41051000402 | 10.3 | 9.1 | 11.5 | % | 3906 | 3188 |
| 4 | 2022 | 41051000401 | 10.4 | 9.2 | 11.6 | % | 3746 | 3023 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 192 | 2022 | 41051008904 | 11.2 | 10.0 | 12.6 | % | 5006 | 3829 |
| 193 | 2022 | 41051001602 | 10.7 | 9.5 | 12.0 | % | 4616 | 3786 |
| 194 | 2022 | 41051001302 | 10.5 | 9.4 | 11.8 | % | 3376 | 2823 |
| 195 | 2022 | 41051001202 | 10.3 | 9.1 | 11.5 | % | 3342 | 2804 |
| 196 | 2022 | 41051000902 | 11.2 | 10.0 | 12.5 | % | 4910 | 4157 |
197 rows × 8 columns
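The query caps results at `$limit=1500`, comfortably above the 197 tracts returned here. If a larger query ever hits the cap, the Socrata API behind data.cdc.gov supports `$offset` for paging. A sketch, where `fetch_all` is a hypothetical helper and `base_url` is assumed to already contain the `?`-prefixed filters:

```python
import pandas as pd

def fetch_all(base_url, page_size=1000):
    """Download all rows from a Socrata endpoint using $limit/$offset paging."""
    pages, offset = [], 0
    while True:
        page = pd.read_csv(f"{base_url}&$limit={page_size}&$offset={offset}")
        if page.empty:
            break  # no more rows to fetch
        pages.append(page)
        offset += page_size
    return pd.concat(pages, ignore_index=True)
```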
In [ ]:
# Change tract identifier datatype for merging
po_tract_gdf.tract2010 = po_tract_gdf.tract2010.astype('int64')  # convert column to integer

# Merge census data with geometry
tract_cdc_gdf = (
    po_tract_gdf
    .merge(cdc_df, left_on='tract2010', right_on='tract', how='inner')  # keep only tracts present in *both* datasets
)

# Plot asthma data as a choropleth
(
    gv.tile_sources.EsriImagery  # Esri satellite imagery as the base map
    *
    gv.Polygons(  # polygons represent the census tracts
        tract_cdc_gdf.to_crs(ccrs.Mercator()),  # reproject to Mercator for display
        vdims=['asthma', 'tract2010'],  # value dimensions for color and hover
        crs=ccrs.Mercator()  # coordinate reference system for the polygons
    ).opts(color='asthma', colorbar=True, tools=['hover'])  # styling and options
).opts(title='Asthma Rates', width=600, height=600)
Out[ ]:
In [6]:
tract_cdc_gdf.head()
Out[6]:
| | place2010 | tract2010 | ST | PlaceName | plctract10 | PlcTrPop10 | geometry | year | tract | asthma | asthma_ci_low | asthma_ci_high | data_value_unit | totalpopulation | totalpop18plus |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4159000 | 41051000301 | 41 | Portland | 4159000-41051000301 | 5041 | POLYGON ((-13650305.147 5699042.371, -13650193... | 2022 | 41051000301 | 11.2 | 10.0 | 12.6 | % | 5443 | 4908 |
| 1 | 4159000 | 41051000302 | 41 | Portland | 4159000-41051000302 | 6709 | POLYGON ((-13650063.25 5697279.135, -13649943.... | 2022 | 41051000302 | 10.1 | 9.0 | 11.3 | % | 7191 | 5476 |
| 2 | 4159000 | 41051000401 | 41 | Portland | 4159000-41051000401 | 3418 | POLYGON ((-13648772.389 5699042.688, -13648767... | 2022 | 41051000401 | 10.4 | 9.2 | 11.6 | % | 3746 | 3023 |
| 3 | 4159000 | 41051000402 | 41 | Portland | 4159000-41051000402 | 3477 | POLYGON ((-13647657.747 5699037.289, -13647659... | 2022 | 41051000402 | 10.3 | 9.1 | 11.5 | % | 3906 | 3188 |
| 4 | 4159000 | 41051000501 | 41 | Portland | 4159000-41051000501 | 3732 | POLYGON ((-13646585.183 5696754.598, -13646694... | 2022 | 41051000501 | 11.1 | 9.9 | 12.4 | % | 4134 | 3307 |
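An inner join silently drops tracts that appear in only one of the two datasets. A quick audit (not part of the original pipeline) using pandas' merge indicator counts matched and unmatched tracts:

```python
# Count matched and unmatched tracts before committing to the inner join.
audit = po_tract_gdf.merge(
    cdc_df, left_on='tract2010', right_on='tract',
    how='outer', indicator=True)
print(audit['_merge'].value_counts())  # 'both', 'left_only', 'right_only'
```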
In [ ]:
# Connect to the Microsoft Planetary Computer STAC (SpatioTemporal Asset Catalog)
e84_catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1"
)
e84_catalog.title
Out[ ]:
'Microsoft Planetary Computer STAC API'
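Before searching, it can be worth confirming the NAIP collection and its temporal coverage; a short sketch using the pystac-client API:

```python
# Inspect the NAIP collection metadata before querying for scenes.
naip = e84_catalog.get_collection("naip")
print(naip.title)
print(naip.extent.temporal.intervals)  # available date ranges
```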
In [ ]:
# Convert geometry to lat/lon for STAC
tract_latlon_gdf = tract_cdc_gdf.to_crs(4326)

# Define a path to save NDVI stats
ndvi_stats_path = os.path.join(data_dir, 'po_ndvi_stats.csv')

# Check for existing data - do not access duplicate tracts
downloaded_tracts = []
if os.path.exists(ndvi_stats_path):
    ndvi_stats_df = pd.read_csv(ndvi_stats_path)  # read existing NDVI stats
    downloaded_tracts = ndvi_stats_df.tract.values  # tracts already processed
else:
    print('No census tracts downloaded so far')

# Loop through each census tract
scene_dfs = []  # list to collect DataFrames of scene URLs
for i, tract_values in tqdm(tract_latlon_gdf.iterrows()):
    tract = tract_values.tract2010  # census tract ID
    # Check if statistics are already downloaded for this tract
    if not (tract in downloaded_tracts):
        # Retry up to 5 times in case of a momentary disruption
        retries = 0  # separate counter so the outer loop index is not clobbered
        retry_limit = 5
        while retries < retry_limit:
            # Try accessing the STAC
            try:
                # Search the National Agriculture Imagery Program (NAIP) collection for tiles
                naip_search = e84_catalog.search(
                    collections=["naip"],
                    intersects=shapely.to_geojson(tract_values.geometry),  # tract geometry as GeoJSON for the STAC query
                    datetime="2022"
                )
                # Build dataframe with tracts and tile urls
                scene_dfs.append(pd.DataFrame(dict(
                    tract=tract,
                    date=[pd.to_datetime(scene.datetime).date()
                          for scene in naip_search.items()],  # scene date from each STAC item
                    rgbir_href=[scene.assets['image'].href
                                for scene in naip_search.items()],  # asset URL from each STAC item
                )))
                break  # exit the retry loop once the STAC query succeeds
            # Try again in case of an APIError
            except pystac_client.exceptions.APIError:
                print(
                    f'Could not connect with STAC server. '
                    f'Retrying tract {tract}...')
                time.sleep(2)  # wait 2 seconds before retrying
                retries += 1
                continue

# Concatenate the url dataframes
if scene_dfs:
    scene_df = pd.concat(scene_dfs).reset_index(drop=True)
else:
    scene_df = None

# Preview the URL DataFrame
scene_df
Out[ ]:
| | tract | date | rgbir_href |
|---|---|---|---|
| 0 | 41051007202 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
| 1 | 41051007202 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
| 2 | 41051007202 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
| 3 | 41051007202 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
| 4 | 41051007202 | 2022-06-26 | https://naipeuwest.blob.core.windows.net/naip/... |
| ... | ... | ... | ... |
| 74 | 41051010200 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
| 75 | 41051980000 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
| 76 | 41051980000 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
| 77 | 41051980000 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
| 78 | 41051009605 | 2022-07-11 | https://naipeuwest.blob.core.windows.net/naip/... |
79 rows × 3 columns
In [ ]:
# Skip this step if data are already downloaded
if scene_df is not None:  # only runs when there are new tracts to process
    # Loop through the census tracts with URLs
    for tract, tract_date_gdf in tqdm(scene_df.groupby('tract')):
        # Open all images for tract
        tile_das = []  # list to collect NDVI DataArrays for each tile
        for _, href_s in tract_date_gdf.iterrows():
            # Open vsi connection to data
            tile_da = rxr.open_rasterio(
                href_s.rgbir_href, masked=True).squeeze()  # open and mask the raster data
            # Clip data
            boundary = (
                tract_cdc_gdf
                .set_index('tract2010')
                .loc[[tract]]  # select the geometry for the current tract
                .to_crs(tile_da.rio.crs)  # reproject the geometry to the raster's CRS
                .geometry
            )
            crop_da = tile_da.rio.clip_box(  # crop to the bounding box for efficiency
                *boundary.envelope.total_bounds,
                auto_expand=True)
            clip_da = crop_da.rio.clip(boundary, all_touched=True)  # clip to the exact tract boundary
            # Compute NDVI (NAIP band 4 is near-infrared, band 1 is red)
            ndvi_da = (
                (clip_da.sel(band=4) - clip_da.sel(band=1))
                / (clip_da.sel(band=4) + clip_da.sel(band=1))
            )
            # Accumulate result
            tile_das.append(ndvi_da)

        # Merge data
        scene_da = rxrmerge.merge_arrays(tile_das)

        # Mask vegetation
        veg_mask = (scene_da > .3)

        # Calculate statistics and save data to file
        total_pixels = scene_da.notnull().sum()
        veg_pixels = veg_mask.sum()

        # Calculate mean patch size
        labeled_patches, num_patches = label(veg_mask)
        # Count patch pixels, ignoring the background (label 0)
        patch_sizes = np.bincount(labeled_patches.ravel())[1:]
        mean_patch_size = patch_sizes.mean()

        # Calculate edge density
        kernel = np.array([  # Laplacian-style kernel for edge detection
            [1, 1, 1],
            [1, -8, 1],
            [1, 1, 1]])
        edges = convolve(veg_mask, kernel, mode='constant')  # convolve the mask to find edges
        edge_density = np.sum(edges != 0) / veg_mask.size  # fraction of pixels on a patch edge

        # Add a row to the statistics file for this tract
        pd.DataFrame(dict(
            tract=[tract],
            total_pixels=[int(total_pixels)],
            frac_veg=[float(veg_pixels / total_pixels)],  # fraction of vegetated pixels
            mean_patch_size=[mean_patch_size],
            edge_density=[edge_density]
        )).to_csv(
            ndvi_stats_path,
            mode='a',  # append to the file
            index=False,
            header=(not os.path.exists(ndvi_stats_path))  # write header only if the file is new
        )

# Re-load results from file into a pandas dataframe
ndvi_stats_df = pd.read_csv(ndvi_stats_path)
ndvi_stats_df
In [ ]:
# Check dataframe contents
ndvi_stats_df = pd.read_csv(ndvi_stats_path)
ndvi_stats_df
Out[ ]:
| | tract | total_pixels | frac_veg | mean_patch_size | edge_density |
|---|---|---|---|---|---|
| 0 | 41051000301 | 37910355 | 0.225860 | 130.324303 | 0.127315 |
| 1 | 41051000302 | 41854872 | 0.344999 | 159.488132 | 0.190479 |
| 2 | 41051000401 | 16672710 | 0.253196 | 163.344219 | 0.148197 |
| 3 | 41051000402 | 13986849 | 0.222779 | 98.904079 | 0.165858 |
| 4 | 41051000501 | 14558120 | 0.184343 | 83.117319 | 0.145830 |
| ... | ... | ... | ... | ... | ... |
| 85 | 41051006801 | 12023343 | 0.417062 | 196.653908 | 0.201007 |
| 86 | 41051006802 | 46861129 | 0.263234 | 390.065646 | 0.095664 |
| 87 | 41051006900 | 83355432 | 0.235718 | 327.904357 | 0.076315 |
| 88 | 41051007100 | 5552098 | 0.239320 | 552.945901 | 0.052000 |
| 89 | 41051007201 | 128596450 | 0.029843 | 87.629592 | 0.019367 |
90 rows × 5 columns
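The patch and edge statistics above come from `scipy.ndimage.label` and a 3×3 Laplacian-style kernel. The toy example below (illustrative only) shows both operations on a small binary mask:

```python
import numpy as np
from scipy.ndimage import convolve, label

mask = np.array([
    [1, 1, 0, 0],
    [1, 1, 0, 1],
    [0, 0, 0, 1],
])
# Two 4-connected patches: the 2x2 block and the vertical pair.
labeled, n = label(mask)
print(n)  # 2
print(np.bincount(labeled.ravel())[1:])  # patch sizes: [4 2]

# Nonzero responses of the Laplacian-style kernel mark patch edges.
kernel = np.array([[1, 1, 1], [1, -8, 1], [1, 1, 1]])
edges = convolve(mask, kernel, mode='constant')
print(np.sum(edges != 0) / mask.size)  # edge density
```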
In [ ]:
# Merge NDVI statistics with the census data and geometry
ndvi_cdc_gdf = (
    tract_cdc_gdf
    .merge(
        ndvi_stats_df,
        left_on='tract2010', right_on='tract', how='inner')  # keep only tracts present in *both* DataFrames
)

# Plot choropleths with vegetation statistics
def plot_chloropleth(gdf, **opts):
    """Generate a choropleth with the given color column"""
    return gv.Polygons(
        gdf.to_crs(ccrs.Mercator()),  # reproject the GeoDataFrame to the Mercator projection
        crs=ccrs.Mercator()  # coordinate reference system for the plot
    ).opts(colorbar=True, **opts)  # add a colorbar and pass through other options

(
    plot_chloropleth(
        ndvi_cdc_gdf, title='Asthma Rates', color='asthma', cmap='viridis')
    +  # combine the two maps into one layout
    plot_chloropleth(
        ndvi_cdc_gdf, title='Vegetation Edge Density', color='edge_density', cmap='Greens')
)
Out[ ]:
In [ ]:
# Variable selection and model transformation
model_df = (
    ndvi_cdc_gdf
    .copy()  # copy so the original DataFrame is not modified
    [[  # select the desired columns
        'frac_veg',  # fraction of vegetated pixels
        'asthma',  # asthma prevalence
        'mean_patch_size',  # mean vegetation patch size
        'edge_density',  # vegetation edge density
        'geometry'  # census tract geometry
    ]]
    .dropna()  # remove rows with missing values (NaN)
)
model_df['log_asthma'] = np.log(model_df.asthma)  # log-transform the asthma values

# Plot a scatter matrix to identify variables that need transformation
hvplot.scatter_matrix(
    model_df
    [[
        'mean_patch_size',
        'edge_density',
        'log_asthma'
    ]]
)
Out[ ]:
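The regression in the next cell fits the outcome on the log scale and back-transforms predictions with the exponential:

$$\log(\text{asthma}) = \beta_0 + \beta_1\,\text{edge density} + \beta_2\,\text{frac veg} + \varepsilon, \qquad \widehat{\text{asthma}} = e^{\hat\beta_0 + \hat\beta_1\,\text{edge density} + \hat\beta_2\,\text{frac veg}}$$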
In [ ]:
# Select predictor (independent) variables X and the outcome variable y for the linear regression model
X = model_df[['edge_density', 'frac_veg']]
y = model_df[['log_asthma']]

# Split into training (67%) and testing (33%) datasets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)  # fixed random_state makes the split reproducible

# Fit a linear regression
reg = LinearRegression()
reg.fit(X_train, y_train)  # train the model

# Predict asthma values for the test dataset
y_test['pred_asthma'] = np.exp(reg.predict(X_test))  # predict log-asthma, then back-transform
y_test['asthma'] = np.exp(y_test.log_asthma)  # back-transform the actual values for comparison

# Plot measured vs. predicted asthma prevalence with a 1-to-1 line
y_max = y_test.asthma.max()
(
    y_test
    .hvplot.scatter(
        x='asthma', y='pred_asthma',
        xlabel='Measured Adult Asthma Prevalence',
        ylabel='Predicted Adult Asthma Prevalence',
        title='Linear Regression Performance - Testing Data'
    )
    .opts(aspect='equal', xlim=(0, y_max), ylim=(0, y_max), height=600, width=600)
) * hv.Slope(slope=1, y_intercept=0).opts(color='black')
Out[ ]:
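The scatter plot gives a visual check; numeric scores make it concrete. A minimal sketch (not in the original notebook) using scikit-learn's metrics on the same back-transformed test set:

```python
from sklearn.metrics import mean_squared_error, r2_score

# Score predictions on the original (percentage-point) scale.
r2 = r2_score(y_test.asthma, y_test.pred_asthma)
rmse = mean_squared_error(y_test.asthma, y_test.pred_asthma) ** 0.5
print(f"R^2 = {r2:.3f}, RMSE = {rmse:.2f} percentage points")
```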
In [ ]:
# Compute model error for all census tracts
model_df['pred_asthma'] = np.exp(reg.predict(X))  # predict asthma for all tracts and back-transform from the log scale
model_df['err_asthma'] = model_df['pred_asthma'] - model_df['asthma']  # prediction error (predicted minus measured)

# Plot error geographically as a choropleth
(
    plot_chloropleth(model_df, title='Model Error Map for Portland, Oregon', color='err_asthma', cmap='RdBu')
    .redim.range(err_asthma=(-.3, .3))
    .opts(frame_width=600, aspect='equal')
)
# The color of each census tract represents the magnitude and direction of the error.
# With the RdBu colormap, red tracts indicate underestimation (predicted below measured)
# and blue tracts indicate overestimation (predicted above measured).
Out[ ]:
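Because a single train/test split of roughly 90 tracts can be sensitive to which tracts land in the test set, the already-imported `KFold` could give a more stable performance estimate. A sketch using 5-fold cross-validation on the log scale:

```python
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R^2 on the log scale.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    LinearRegression(), X, y.values.ravel(), cv=cv, scoring='r2')
print(f"mean R^2 = {scores.mean():.3f} (+/- {scores.std():.3f})")
```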