Land cover classification in the Tuckasegee Watershed¶
Background Information¶
This study uses the Harmonized Landsat and Sentinel-2 (HLS) multispectral dataset to look at patterns in vegetation. HUC Region 06 covers the drainage of the Tennessee River Basin, spanning parts of Virginia, North Carolina, Georgia, Tennessee, Alabama, Mississippi, and Kentucky. Most of the Tuckasegee River basin within this region is forested.¶
Sources¶
- Boundary Descriptions and Names of Regions, Subregions, Accounting Units and Cataloging Units, from the 1987 USGS Water-Supply Paper 2294: https://water.usgs.gov/GIS/huc_name.html
- Tuckasegee River Subbasin HUC 06010203. NC DEQ
In [1]:
# Import needed packages
# Standard libraries
import os # for operating system manipulation
import pathlib # for managing files and directories
import pickle # enables serialization of objects for saving
import re # pattern matching in strings
import warnings # control how warnings are displayed
# Geospatial Packages
import cartopy.crs as ccrs # coordinate reference system for maps
import dask.dataframe as dd # parallel, chunked dataframe operations
import dask.array as da # parallel, chunked array operations
import earthaccess # simplifies access to nasa earthdata
import earthpy as et # functions that work with raster and vector data
import geopandas as gpd # read and manipulate geo dataframes
import geoviews as gv # geospatial visualization
import hvplot.pandas # plots pandas dataframes
import hvplot.xarray # plots data arrays
import matplotlib.cm as cm # colormaps for plots
import matplotlib.pyplot as plt # plotting interface
import numpy as np # numerical computation
import pandas as pd # tabular dataframes
import rioxarray as rxr # combine xarray with geospatial raster data
import rioxarray.merge as rxrmerge # merging multiple raster datasets
import seaborn as sns # statistical data visualization
from tqdm.notebook import tqdm # tracking processes with progress bar
import xarray as xr # gridded datasets
from shapely.geometry import Polygon # analyze geometric objects
from sklearn.cluster import KMeans # machine learning algorithm to group data
from sklearn.datasets import make_blobs # generate sample clustering data
from sklearn.metrics import silhouette_samples, silhouette_score # cluster quality metrics
# Environment Variables
os.environ["GDAL_HTTP_MAX_RETRY"] = "5" # retry failed GDAL HTTP requests up to 5 times
os.environ["GDAL_HTTP_RETRY_DELAY"] = "1" # wait 1 second between retries
warnings.simplefilter('ignore') # suppress warnings
Organize Information Downloads¶
In [2]:
# Setup data directories
data_dir = os.path.join(
pathlib.Path.home(),
'Documents',
'eaclassprojects',
'clusters',
'nc'
)
os.makedirs(data_dir, exist_ok=True)
data_dir
Out[2]:
'C:\\Users\\stem2\\Documents\\eaclassprojects\\clusters\\nc'
Using a Decorator to Cache Results¶
In [3]:
# Define decorator
def cached(func_key, override=False):
    """
    A decorator that saves function results as pickle files for later retrieval.

    Parameters
    ==========
    func_key: str
        File basename used to save pickled results
    override: bool
        When True, re-compute even if the results are already stored
    """
# Wrap caching function
def compute_and_cache_decorator(compute_function):
"""
Wrap the caching function
Parameters
==========
compute_function: function
The function to run and cache results
"""
        # Inner wrapper: check the cache before running the computation
def compute_and_cache(*args, **kwargs):
"""
Perform a computation and cache, or load cached result.
Parameters
==========
args
Positional arguments for the compute function
kwargs
Keyword arguments for the compute function
"""
# Add an identifier from the particular function call
if 'cache_key' in kwargs:
key = '_'.join((func_key, kwargs['cache_key']))
else:
key = func_key
path = os.path.join(
et.io.HOME, et.io.DATA_NAME, 'tuck', f'{key}.pickle')
# Check if the cache exists already or override caching
if not os.path.exists(path) or override:
# Make tuck directory if needed
os.makedirs(os.path.dirname(path), exist_ok=True)
# Run the compute function as the user did
result = compute_function(*args, **kwargs)
# Pickle the object
with open(path, 'wb') as file:
pickle.dump(result, file)
else:
# Unpickle the object
with open(path, 'rb') as file:
result = pickle.load(file)
return result
return compute_and_cache
return compute_and_cache_decorator
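As a quick illustration of how the decorator behaves (a hypothetical `slow_square` function, not part of this workflow): the first call runs the computation and pickles the result, and any later call with the same key loads the pickle instead.

# Hypothetical usage of the cached decorator defined above
@cached('slow_square')
def slow_square(n, cache_key):
    return n ** 2

slow_square(12, cache_key='n12') # computed and pickled on the first call
slow_square(12, cache_key='n12') # loaded from the pickle on repeat calls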
Study Site¶
In [4]:
# Read watershed boundary dataset shapefile into a GeoDataFrame
@cached('wbd_06')
def read_wbd_file(wbd_filename, huc_level, cache_key):
# Download and unzip
wbd_url = (
"https://prd-tnm.s3.amazonaws.com"
"/StagedProducts/Hydrography/WBD/HU2/Shape/"
f"{wbd_filename}.zip")
wbd_dir = et.data.get_data(url=wbd_url)
# Read desired data
wbd_path = os.path.join(wbd_dir, 'Shape', f'WBDHU{huc_level}.shp')
wbd_gdf = gpd.read_file(wbd_path, engine='pyogrio')
return wbd_gdf
huc_level = 12 # identify HUC level
wbd_gdf = read_wbd_file(
"WBD_06_HU2_Shape", huc_level, cache_key=f'hu{huc_level}') # Call the function using cache
huc_column = f'huc{huc_level}'
if huc_column in wbd_gdf.columns:
tuck_gdf = (
wbd_gdf[wbd_gdf[huc_column].isin(['060102030303'])] # filter for specific river basin
.dissolve() # create a single polygon
)
if not tuck_gdf.empty:
(
tuck_gdf.to_crs(ccrs.Mercator()) # Reproject to Mercator for web mapping
.hvplot(
alpha=.2, fill_color='white', # set styling
tiles='EsriImagery', crs=ccrs.Mercator()) # add background map
.opts(title='Tuckasegee River Watershed, NC', width=600, height=300) # set plot dimensions
)
    else:
        print("Warning: no watershed matched HUC code 060102030303")
else:
    print(f"Error: column '{huc_column}' not found in the WBD GeoDataFrame")
In [5]:
tuck_gdf.head()
Out[5]:
geometry | tnmid | metasource | sourcedata | sourceorig | sourcefeat | loaddate | referenceg | areaacres | areasqkm | ... | huc12 | name | hutype | humod | tohuc | noncontrib | noncontr_1 | shape_Leng | shape_Area | ObjectID | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | POLYGON ((-83.25331 35.37512, -83.2531 35.3747... | {D14E33D5-44C1-4BBA-B328-91586CF760C7} | {511D2AC8-11BA-45FC-AB98-F69D693D4C44} | Watershed Boundary Dataset (WBD) | Natural Resources and Conservation Service and... | None | 2024-08-15 | 989920,1016248 | 5429.31 | 21.97 | ... | 060102030303 | Mill Creek-Tuckasegee River | S | NM | 060102030304 | 0.0 | 0.0 | NaN | NaN | 768 |
1 rows × 21 columns
In [6]:
tuck_gdf.to_crs(ccrs.Mercator()).hvplot(
alpha=.2,
fill_color='red',
tiles='EsriImagery',
crs=ccrs.Mercator()
).opts(
title='Tuckasegee River Watershed, NC',
width=600,
height=300
)
Out[6]:
In [7]:
wbd_gdf.head()
Out[7]:
tnmid | metasource | sourcedata | sourceorig | sourcefeat | loaddate | referenceg | areaacres | areasqkm | states | ... | name | hutype | humod | tohuc | noncontrib | noncontr_1 | shape_Leng | shape_Area | ObjectID | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | {B2398188-7360-43E5-9734-34C4C2C0ED75} | {511D2AC8-11BA-45FC-AB98-F69D693D4C44} | Watershed Boundary Dataset (WBD) | Natural Resources and Conservation Service and... | None | 2024-08-15 | 125931,151915 | 19065.85 | 77.16 | AL | ... | Rorex Creek-Jones Creek | S | NM | 060300010408 | 0.0 | 0.0 | NaN | NaN | 1 | POLYGON ((-85.7839 34.70814, -85.78404 34.7080... |
1 | {ADD4C622-05E7-4EE0-A37C-7A7561819CD1} | {511D2AC8-11BA-45FC-AB98-F69D693D4C44} | Watershed Boundary Dataset (WBD) | Natural Resources and Conservation Service and... | None | 2024-08-15 | 1272000,1269346 | 18132.07 | 73.38 | TN | ... | Sycamore Creek-Big Sycamore Creek | S | KA | 060102050903 | NaN | NaN | NaN | NaN | 2 | POLYGON ((-83.28651 36.54126, -83.28664 36.541... |
2 | {7FDF1CF1-714A-46C3-98F9-3CDEE8CE962E} | {511D2AC8-11BA-45FC-AB98-F69D693D4C44} | Watershed Boundary Dataset (WBD) | Natural Resources and Conservation Service and... | None | 2024-08-15 | 123559 | 33164.34 | 134.21 | AL | ... | Lower Mud Creek | S | NM | 060300010408 | 0.0 | 0.0 | NaN | NaN | 3 | POLYGON ((-85.91333 34.84872, -85.91333 34.848... |
3 | {D7AED0DA-C3E5-4AE3-9989-EDD6A77FF1A7} | {511D2AC8-11BA-45FC-AB98-F69D693D4C44} | Watershed Boundary Dataset (WBD) | Natural Resources and Conservation Service and... | None | 2024-08-15 | 116557 | 27750.14 | 112.30 | AL | ... | Coon Creek | S | NM | 060300010408 | 0.0 | 0.0 | NaN | NaN | 4 | POLYGON ((-85.73206 34.82984, -85.73252 34.829... |
4 | {70D79B1B-2E9E-4B38-AEA9-6CB0A10F391B} | {511D2AC8-11BA-45FC-AB98-F69D693D4C44} | Watershed Boundary Dataset (WBD) | Natural Resources and Conservation Service and... | None | 2024-08-15 | 1485282,1487063 | 28196.00 | 114.11 | VA | ... | McDonald Branch-North Fork Holston River | S | KA | 060101010103 | 0.0 | 0.0 | NaN | NaN | 5 | POLYGON ((-81.37268 36.96393, -81.37284 36.963... |
5 rows × 21 columns
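One detail worth confirming before searching for imagery in the next section: `earthaccess.search_data` expects `bounding_box` as (lower-left longitude, lower-left latitude, upper-right longitude, upper-right latitude), and GeoPandas `total_bounds` returns (minx, miny, maxx, maxy), so the two plug together directly.

# Sanity check: total_bounds order matches earthaccess's bounding_box order
print(tuple(tuck_gdf.total_bounds)) # (minx, miny, maxx, maxy) = (W, S, E, N)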
Access Multispectral Data¶
In [8]:
# Log in to earthaccess
earthaccess.login(persist=True)
# Search for HLS tiles
results = earthaccess.search_data(
short_name="HLSL30",
cloud_hosted=True,
bounding_box=tuple(tuck_gdf.total_bounds),
temporal=("2023-06", "2023-09"),
)
# Confirm the contents
num_granules = len(results)
print(f"Number of granules found: {num_granules}")
print(results[0])
Number of granules found: 60
Collection: {'EntryTitle': 'HLS Landsat Operational Land Imager Surface Reflectance and TOA Brightness Daily Global 30m v2.0'}
Spatial coverage: {'HorizontalSpatialDomain': {'Geometry': {'GPolygons': [{'Boundary': {'Points': [{'Longitude': -83.19517003, 'Latitude': 35.13500195}, {'Longitude': -81.99027924, 'Latitude': 35.15084307}, {'Longitude': -82.00257131, 'Latitude': 36.140702}, {'Longitude': -83.22239647, 'Latitude': 36.12427601}, {'Longitude': -83.19517003, 'Latitude': 35.13500195}]}}]}}}
Temporal coverage: {'RangeDateTime': {'BeginningDateTime': '2023-06-01T16:05:15.499Z', 'EndingDateTime': '2023-06-01T16:05:39.386Z'}}
Size(MB): 198.7151746749878
Data: ['https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B09.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B05.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B10.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.VAA.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B07.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.SAA.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.VZA.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B01.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.SZA.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B03.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B06.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B04.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B11.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.Fmask.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/HLSL30.020/HLS.L30.T17SLV.2023152T160515.v2.0/HLS.L30.T17SLV.2023152T160515.v2.0.B02.tif']
In [9]:
results = earthaccess.search_data(
short_name="HLSL30",
cloud_hosted=True,
bounding_box=tuple(tuck_gdf.total_bounds),
temporal=("2023-05", "2023-09"),
)
print(type(results)) # confirm that results is a plain Python list of granules
<class 'list'>
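Each granule records its acquisition time in its UMM metadata (the same fields parsed in the next section), so one quick sanity check on the search window is a tally of granules per month; a sketch, assuming `results` from the cell above:

# Count granules per acquisition month from each granule's UMM metadata
dates = pd.to_datetime([
    granule['umm']['TemporalExtent']['RangeDateTime']['BeginningDateTime']
    for granule in results
])
print(dates.to_period('M').value_counts().sort_index())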
Compile Information about each Granule¶
In [10]:
# Extract and organize Earthaccess data
def get_earthaccess_links(results):
url_re = re.compile(
r'\.(?P<tile_id>\w+)\.\d+T\d+\.v\d\.\d\.(?P<band>[A-Za-z0-9]+)\.tif')
# Loop through each granule
    link_rows = []  # Initialize a list of one-row GeoDataFrames, one per file
for granule in tqdm(results):
# Get granule metadata information
info_dict = granule['umm']
granule_id = info_dict['GranuleUR']
datetime = pd.to_datetime(
info_dict
['TemporalExtent']['RangeDateTime']['BeginningDateTime'])
points = ( # Extract polygon coordinates
info_dict
['SpatialExtent']['HorizontalSpatialDomain']['Geometry']['GPolygons'][0]
['Boundary']['Points'])
geometry = Polygon( # Create a Shapely Polygon object
[(point['Longitude'], point['Latitude']) for point in points])
# Get data files associated with the granule
files = earthaccess.open([granule])
# Loop through each file within the granule
for file in files:
match = url_re.search(file.full_name)
if match is not None:
# Create a GeoDataFrame for the current file and append to list
link_rows.append(
gpd.GeoDataFrame(
dict(
datetime=[datetime],
tile_id=[match.group('tile_id')], # Extract tile ID
band=[match.group('band')], # Extract band name
url=[file], # Store the file object
geometry=[geometry] # Store the geometry
),
crs="EPSG:4326" # set coordinate reference system
)
)
# Concatenate GeoDataFrames into a single gdf
file_df = pd.concat(link_rows).reset_index(drop=True) # combine all rows
return file_df
results = earthaccess.search_data(
short_name="HLSL30",
cloud_hosted=True,
bounding_box=tuple(tuck_gdf.total_bounds),
temporal=("2023-06", "2023-09"),
)
if results: # Check if results is not empty
first_result = results[0] # Get the first result
file_df = get_earthaccess_links([first_result]) # Pass a list containing only the first result to the function
print(file_df.head())
else:
print("No results found.")
                          datetime tile_id band  \
0 2023-06-01 16:05:15.499000+00:00  T17SLV  B09
1 2023-06-01 16:05:15.499000+00:00  T17SLV  B05
2 2023-06-01 16:05:15.499000+00:00  T17SLV  B10
3 2023-06-01 16:05:15.499000+00:00  T17SLV  VAA
4 2023-06-01 16:05:15.499000+00:00  T17SLV  B07

                                                 url  \
0  <File-like object HTTPFileSystem, https://data...
1  <File-like object HTTPFileSystem, https://data...
2  <File-like object HTTPFileSystem, https://data...
3  <File-like object HTTPFileSystem, https://data...
4  <File-like object HTTPFileSystem, https://data...

                                            geometry
0  POLYGON ((-83.19517 35.135, -81.99028 35.15084...
1  POLYGON ((-83.19517 35.135, -81.99028 35.15084...
2  POLYGON ((-83.19517 35.135, -81.99028 35.15084...
3  POLYGON ((-83.19517 35.135, -81.99028 35.15084...
4  POLYGON ((-83.19517 35.135, -81.99028 35.15084...
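The regular expression in `get_earthaccess_links` does the heavy lifting here: it pulls the tile ID and band name out of each HLS file name. A standalone check against one of the file names from the search output:

# Demonstrate the filename regex on a sample HLS file name
url_re = re.compile(
    r'\.(?P<tile_id>\w+)\.\d+T\d+\.v\d\.\d\.(?P<band>[A-Za-z0-9]+)\.tif')
name = 'HLS.L30.T17SLV.2023152T160515.v2.0.B04.tif'
match = url_re.search(name)
print(match.group('tile_id'), match.group('band')) # T17SLV B04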
Open, Crop, and Mask Data using a Cached Decorator¶
In [11]:
@cached('tuck_reflectance_da_df')
def compute_reflectance_da(search_results, boundary_gdf):
"""
Connect to files over VSI, crop, cloud mask, and wrangle.
Returns a single reflectance DataFrame with all bands as columns and
centroid coordinates and datetime as the index.
Parameters
==========
file_df : pd.DataFrame
File connection and metadata (datetime, tile_id, band, and url)
boundary_gdf : gpd.GeoDataFrame
Boundary use to crop the data
"""
def open_dataarray(url, boundary_proj_gdf, scale=1, masked=True):
# Open masked DataArray
da = rxr.open_rasterio(url, masked=masked).squeeze() * scale
# Reproject boundary if needed
if boundary_proj_gdf is None:
boundary_proj_gdf = boundary_gdf.to_crs(da.rio.crs)
# Crop
cropped = da.rio.clip_box(*boundary_proj_gdf.total_bounds)
return cropped
def compute_quality_mask(da, mask_bits=[1, 2, 3]):
"""Mask out low quality data by bit."""
# Unpack bits into a new axis
bits = (
np.unpackbits(
da.astype(np.uint8), bitorder='little'
).reshape(da.shape + (-1,))
)
# Select the required bits and check if any are flagged
mask = np.prod(bits[..., mask_bits]==0, axis=-1)
return mask
file_df = get_earthaccess_links(search_results)
granule_da_rows = []
boundary_proj_gdf = None
# Loop through each image
group_iter = file_df.groupby(['datetime', 'tile_id'])
for (datetime, tile_id), granule_df in tqdm(group_iter):
print(f'Processing granule {tile_id} {datetime}')
# Open granule cloud cover
cloud_mask_url = (
granule_df.loc[granule_df.band=='Fmask', 'url']
.values[0])
cloud_mask_cropped_da = open_dataarray(cloud_mask_url, boundary_proj_gdf, masked=False)
# Compute cloud mask
cloud_mask = compute_quality_mask(cloud_mask_cropped_da)
# Loop through each spectral band
for i, row in granule_df.iterrows():
if row.band.startswith('B'):
# Open, crop, and mask the band
band_cropped = open_dataarray(
row.url, boundary_proj_gdf, scale=0.0001)
band_cropped.name = row.band
# Add the DataArray to the metadata DataFrame row
row['da'] = band_cropped.where(cloud_mask)
granule_da_rows.append(row.to_frame().T)
# Chunking for memory efficiency
chunk_size = 100 # Adjust as needed
result_chunks = []
for i in range(0, len(granule_da_rows), chunk_size):
chunk = granule_da_rows[i:i + chunk_size]
result_chunks.append(pd.concat(chunk))
final_result = pd.concat(result_chunks)
return final_result
tuck_reflectance_da_df = compute_reflectance_da(results, tuck_gdf)
tuck_reflectance_da_df.head()
Out[11]:
datetime | tile_id | band | url | geometry | da | |
---|---|---|---|---|---|---|
17 | 2023-06-01 16:05:15.499000+00:00 | T17SKV | B04 | <File-like object HTTPFileSystem, https://data... | POLYGON ((-83.72416443 35.12425967, -83.087891... | [[<xarray.DataArray 'B04' ()> Size: 4B\narray(... |
18 | 2023-06-01 16:05:15.499000+00:00 | T17SKV | B07 | <File-like object HTTPFileSystem, https://data... | POLYGON ((-83.72416443 35.12425967, -83.087891... | [[<xarray.DataArray 'B07' ()> Size: 4B\narray(... |
21 | 2023-06-01 16:05:15.499000+00:00 | T17SKV | B03 | <File-like object HTTPFileSystem, https://data... | POLYGON ((-83.72416443 35.12425967, -83.087891... | [[<xarray.DataArray 'B03' ()> Size: 4B\narray(... |
22 | 2023-06-01 16:05:15.499000+00:00 | T17SKV | B09 | <File-like object HTTPFileSystem, https://data... | POLYGON ((-83.72416443 35.12425967, -83.087891... | [[<xarray.DataArray 'B09' ()> Size: 4B\narray(... |
24 | 2023-06-01 16:05:15.499000+00:00 | T17SKV | B01 | <File-like object HTTPFileSystem, https://data... | POLYGON ((-83.72416443 35.12425967, -83.087891... | [[<xarray.DataArray 'B01' ()> Size: 4B\narray(... |
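The bit logic in `compute_quality_mask` is easiest to verify on hand-picked Fmask values. In the HLS v2.0 Fmask, bit 1 flags cloud, bit 2 adjacent cloud/shadow, and bit 3 cloud shadow, so values of 2 or 8 should be masked while 0 (clear) and 64 (aerosol bit only) should pass; a small check using the same unpackbits logic:

# Verify the quality mask on sample Fmask values:
# 0 = clear, 2 = cloud (bit 1), 8 = cloud shadow (bit 3), 64 = aerosol (bit 6)
vals = np.array([[0, 2], [8, 64]], dtype=np.uint8)
bits = np.unpackbits(vals, bitorder='little').reshape(vals.shape + (-1,))
mask = np.prod(bits[..., [1, 2, 3]] == 0, axis=-1)
print(mask) # [[1 0]
            #  [0 1]] -- only the clear and aerosol-only pixels pass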
Merge and Composite Data¶
In [12]:
@cached('tuck_reflectance_da') # Function output stored, same input gives cached output improving performance
def merge_and_composite_arrays(granule_da_df):
    # Merge tiles and composite an image for each band
    da_list = []  # List to store the composited DataArrays for each band
for band, band_df in tqdm(granule_da_df.groupby('band')): # Iterate through each band
merged_das = [] # List to store merged DAs for each date within the current band
for datetime, date_df in tqdm(band_df.groupby('datetime')): # Iterate through each date within the current band
# Merge granules for each date
merged_da = rxrmerge.merge_arrays(list(date_df.da))
# Mask negative values
merged_da = merged_da.where(merged_da>0)
# Add the merged DA for the current date to the list
merged_das.append(merged_da)
        # Composite across dates using the median reflectance per pixel,
        # which reduces the influence of clouds
composite_da = xr.concat(merged_das, dim='datetime').median('datetime')
composite_da['band'] = int(band[1:]) # Converts band name to integer
composite_da.name = 'reflectance' # Sets the name of the DataArray
da_list.append(composite_da) # Add the composite DA for the current band to the list
# Concatenate the composite DataArrays for all bands into a single DataArray
return xr.concat(da_list, dim='band')
tuck_reflectance_da = merge_and_composite_arrays(tuck_reflectance_da_df) # Call the function to process the reflectance data
tuck_reflectance_da[0] # Display the composite for the first band
Out[12]:
<xarray.DataArray 'reflectance' (y: 186, x: 292)> Size: 217kB
array([[0.0173 , 0.0122 , 0.0138 , ..., 0.0329 , 0.02705, 0.0285 ],
       [0.02035, 0.0161 , 0.0147 , ..., 0.0284 , 0.02405, 0.0277 ],
       [0.01725, 0.0122 , 0.0134 , ..., 0.0267 , 0.0219 , 0.0164 ],
       ...,
       [0.0182 , 0.0227 , 0.0219 , ..., 0.02135, 0.02555, 0.0278 ],
       [0.01935, 0.0209 , 0.0221 , ..., 0.0216 , 0.0247 , 0.0278 ],
       [0.01955, 0.0162 , 0.0153 , ..., 0.01725, 0.0183 , 0.0206 ]],
      dtype=float32)
Coordinates:
  * x            (x) float64 2kB 2.939e+05 2.94e+05 ... 3.026e+05 3.027e+05
  * y            (y) float64 1kB 3.917e+06 3.917e+06 ... 3.912e+06 3.911e+06
    band         int64 8B 1
    spatial_ref  int64 8B 0
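The median composite is what suppresses transient bright pixels (clouds) and fills masked ones; a toy version of the same `xr.concat(...).median('datetime')` pattern, reusing the `xr` and `np` imports from the top:

# Toy median composite: the 0.9 outlier and the NaN are both ignored
date_das = [
    xr.DataArray([[0.1, 0.2], [0.3, 0.4]], dims=('y', 'x')),
    xr.DataArray([[0.9, 0.2], [0.3, 0.4]], dims=('y', 'x')), # cloudy pixel
    xr.DataArray([[0.1, 0.2], [0.3, np.nan]], dims=('y', 'x')), # masked pixel
]
composite = xr.concat(date_das, dim='datetime').median('datetime')
print(composite.values) # [[0.1 0.2]
                        #  [0.3 0.4]]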
Analyze Using K-Means¶
In [16]:
# How many different types of vegetation are there?
# Convert the spectral DataArray to a DataFrame with one column per band
model_df = tuck_reflectance_da.to_dataframe().reflectance.unstack('band')
# Drop the thermal bands (10 and 11) and any pixels with missing values
model_df = model_df.drop(columns=[10, 11]).dropna()
model_df.head()
Out[16]:
band | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 | |
---|---|---|---|---|---|---|---|---|---|
y | x | ||||||||
3917025.0 | 293925.0 | 0.0173 | 0.0202 | 0.0447 | 0.0244 | 0.3751 | 0.1667 | 0.0628 | 0.0012 |
293955.0 | 0.0122 | 0.0155 | 0.0351 | 0.0223 | 0.1678 | 0.0759 | 0.0313 | 0.0012 | |
293985.0 | 0.0138 | 0.0180 | 0.0351 | 0.0310 | 0.1667 | 0.0959 | 0.0460 | 0.0012 | |
294015.0 | 0.0175 | 0.0235 | 0.0501 | 0.0400 | 0.3075 | 0.1918 | 0.0945 | 0.0012 | |
294045.0 | 0.0239 | 0.0264 | 0.0518 | 0.0365 | 0.3165 | 0.1866 | 0.0854 | 0.0013 |
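The `to_dataframe().reflectance.unstack('band')` pattern above turns the (band, y, x) cube into one row per pixel and one column per band, which is the layout scikit-learn expects; a toy version with made-up values:

# Toy reshape: a (band, y, x) DataArray becomes one row per pixel,
# one column per band
demo_da = xr.DataArray(
    np.arange(8).reshape(2, 2, 2) / 10,
    dims=('band', 'y', 'x'),
    coords={'band': [1, 2]},
    name='reflectance')
print(demo_da.to_dataframe().reflectance.unstack('band'))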
In [17]:
# Fit the model and predict cluster labels in one step;
# k-means is unsupervised, so there are no target data to hold out.
prediction = KMeans(n_clusters=6).fit_predict(model_df.values)
# Add the predicted values back to the model DataFrame
model_df['clusters'] = prediction
model_df
Out[17]:
band | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 | clusters | |
---|---|---|---|---|---|---|---|---|---|---|
y | x | |||||||||
3917025.0 | 293925.0 | 0.01730 | 0.02020 | 0.04470 | 0.02440 | 0.37510 | 0.16670 | 0.06280 | 0.0012 | 0 |
293955.0 | 0.01220 | 0.01550 | 0.03510 | 0.02230 | 0.16780 | 0.07590 | 0.03130 | 0.0012 | 1 | |
293985.0 | 0.01380 | 0.01800 | 0.03510 | 0.03100 | 0.16670 | 0.09590 | 0.04600 | 0.0012 | 1 | |
294015.0 | 0.01750 | 0.02350 | 0.05010 | 0.04000 | 0.30750 | 0.19180 | 0.09450 | 0.0012 | 5 | |
294045.0 | 0.02390 | 0.02640 | 0.05180 | 0.03650 | 0.31650 | 0.18660 | 0.08540 | 0.0013 | 5 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3911475.0 | 302535.0 | 0.01310 | 0.01950 | 0.04745 | 0.03040 | 0.33955 | 0.15270 | 0.06685 | 0.0012 | 2 |
302565.0 | 0.01305 | 0.01495 | 0.03500 | 0.01605 | 0.34540 | 0.13385 | 0.05060 | 0.0010 | 2 | |
302595.0 | 0.01725 | 0.01855 | 0.03675 | 0.01890 | 0.33700 | 0.13670 | 0.05455 | 0.0010 | 2 | |
302625.0 | 0.01830 | 0.02030 | 0.04130 | 0.02235 | 0.35000 | 0.14685 | 0.05905 | 0.0011 | 2 | |
302655.0 | 0.02060 | 0.02190 | 0.04475 | 0.02390 | 0.38795 | 0.15685 | 0.06280 | 0.0010 | 0 |
54312 rows × 9 columns
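The silhouette functions imported at the top are not otherwise used, but they are the standard way to sanity-check the choice of six clusters: scores closer to 1 indicate tighter, better-separated clusters. A sketch, assuming `model_df` from the cell above (a random sample keeps the pairwise-distance cost manageable):

# Compare silhouette scores across candidate cluster counts on a sample
sample_df = model_df.drop(columns='clusters').sample(5000, random_state=42)
for k in range(3, 8):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(sample_df.values)
    print(k, round(silhouette_score(sample_df.values, labels), 3))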
Plot Reflectance¶
In [22]:
# Select bands from reflectance data array
rgb = tuck_reflectance_da.sel(band=[4, 3, 2]) # band 4=red, 3=green, 2=blue
# Convert to integers (0-255); mask missing pixels with notnull(),
# since `rgb != np.nan` is always True and would mask nothing
rgb_uint8 = (rgb * 255).astype(np.uint8).where(rgb.notnull())
rgb_bright = rgb_uint8 * 10 # increase brightness
rgb_sat = rgb_bright.where(rgb_bright < 255, 255) # clip so no pixel exceeds 255
# Display the RGB composite and the cluster map side by side
(
    rgb_sat.hvplot.rgb(
        x='x', y='y', bands='band',
        data_aspect=1,
        xaxis=None, yaxis=None,
        title="RGB Composite of the Tuckasegee River Watershed"
    )
    + # `+` places the cluster map next to the RGB composite
    model_df.clusters.to_xarray().sortby(['x', 'y']).hvplot(
        cmap="colorblind", aspect='equal', xaxis=None, yaxis=None)
)
Out[22]:
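Note that `+` lays the two plots out side by side. For a true overlay, holoviews composes layers with `*`; a sketch reusing `rgb_sat` and `model_df` from above, with a semi-transparent cluster layer (the alpha value is a guess that may need tuning):

# Sketch: overlay the cluster map on the RGB composite with `*`
(
    rgb_sat.hvplot.rgb(x='x', y='y', bands='band', data_aspect=1)
    * model_df.clusters.to_xarray().sortby(['x', 'y']).hvplot(
        cmap='colorblind', alpha=0.5)
)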