import re
from datetime import datetime
from glob import glob
from pathlib import Path
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import netCDF4 as nc
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import earthaccess
# For the timeseries section
from scipy import spatial
# For the timeseries section, to compare with EPA Air Quality data
import pyrsig
# For the animation section
Exploring aerosol data from DSCOVR EPIC Level 2 Aerosol Version 3
Summary
We investigate the time lag between wildfire smoke detected by satellite scans and the corresponding readings from ground-based sensors. This analysis is crucial for understanding the behavior and transport of wildfire smoke plumes as they descend from the atmosphere to ground level, impacting air quality. We utilize satellite-derived Aerosol Index (AI) data, which indicates the presence of aerosols in the atmosphere, and ground-based Particulate Matter (PM) measurements, particularly PM2.5, to quantify smoke concentrations at the surface. To analyze the time lag, we first normalize the AI units from satellite data and PM2.5 units from the ground sensors to create comparable time series. By shifting the ground sensor data over a range of time intervals, we aim to identify the optimal lag where the ground-based PM2.5 readings align with peaks in the satellite-derived AI values. This approach allows us to estimate the delay between smoke detection in the upper atmosphere and its eventual impact on air quality at the surface. Various statistical methods, such as cross-correlation, are employed to quantify this time lag and improve the predictive understanding of smoke dispersion patterns.
Dataset Information
“DSCOVR_EPIC_L2_AER_03 is the Deep Space Climate Observatory (DSCOVR) Enhanced Polychromatic Imaging Camera (EPIC) Level 2 UV Aerosol Version 3 data product. Observations for this data product are at 340 and 388 nm and are used to derive near UV (ultraviolet) aerosol properties. The EPIC aerosol retrieval algorithm (EPICAERUV) uses a set of aerosol models to account for the presence of carbonaceous aerosols from biomass burning and wildfires (BIO), desert dust (DST), and sulfate-based (SLF) aerosols. These aerosol models are identical to those assumed in the OMI (Ozone Monitoring Instrument) algorithm (Torres et al., 2007; Jethva and Torres, 2011).” (Source)
Prerequisites
This notebook requires these libraries and files: - cartopy - earthaccess - netCDF4-python - numpy - matplotlib - pyrsig - scipy - tqdm - xarray - Data granules from DSCOVR EPIC Level-2 to have been downloaded into a local directory.
Section 1:
1. Import Required Packages
2. Download data files
This section defines the time and location from which to pull the data files. After running it once there is no reason to run it again. This section can also be altered to change the type of data being pulled.
# 2. Download data files
data_dir = Path("data/dscovr/").resolve()

auth = earthaccess.login()
# Are we authenticated? If not, ask for credentials and persist them in a .netrc file.
if not auth.authenticated:
    auth.login(strategy="interactive", persist=True)

# NASA CMR concept id for the DSCOVR_EPIC_L2_AER_03 collection.
collection_id = "C1962643459-LARC_ASDC"
# Bounds within which we search for data granules
date_start = "2023-06-02 00:00"
date_end = "2023-06-15 23:59"
date_range = (date_start, date_end)
bbox = (-79, 34, -73, 40)  # minLon, minLat, maxLon, maxLat
# extent = [-79, -73, 35, 39.5]  # extent = [minLon, maxLon, minLat, maxLat]

# Find the data granules and their links
results = earthaccess.search_data(
    concept_id=collection_id,
    temporal=date_range,
    bounding_box=bbox,
)

# !!! Warning !!! This next step can take a long time. It needs to be run once,
# but should be commented-out if the data files are already in a local directory.
downloaded_files = earthaccess.download(
    results,
    local_path=data_dir,
)

# Return the list of files that are present
pattern = re.compile(
    r"DSCOVR_EPIC_L2_AER_03_([0-9]+)_03"
)  # the capture group is a datetime timestamp
file_list = list(data_dir.glob("DSCOVR_EPIC_L2_AER_*.he5"))
num_files = len(file_list)
# These are the paths to groups *within* each HDF file
data_fields_path = "HDFEOS/SWATHS/Aerosol NearUV Swath/Data Fields"
geolocations_path = "HDFEOS/SWATHS/Aerosol NearUV Swath/Geolocation Fields"

# 3. Wrangling data into Zarr stores: the HDF files are converted to Zarr
# stores to enable faster access for data manipulation later in the notebook.
def convert_hdf_to_xrds(filepath: Path, group_path: str) -> xr.Dataset:
    """Convert one HDF group into an xarray Dataset with an added time dimension.

    Parameters
    ----------
    filepath : pathlib.Path
        The path to a DSCOVR EPIC HDF file.
    group_path : str
        The internal path to a group, e.g.,
        'HDFEOS/SWATHS/Aerosol NearUV Swath/Data Fields'.

    Returns
    -------
    xarray.Dataset
        The group's variables with a singleton 'time' dimension whose only
        value is the timestamp parsed from the filename.
    """
    # The filename's datetime string is converted into a datetime timestamp.
    # BUG FIX: use the `filepath` parameter, not the global loop variable `fp`.
    timestamp = pattern.findall(str(filepath.stem))[0]
    timestamp_dt = datetime.strptime(timestamp, "%Y%m%d%H%M%S")
    # Note: the netCDF library works to access these datasets, so we just use it instead of h5py.
    with nc.Dataset(filepath) as ds:
        grp = ds[group_path]
        # The HDF group is converted into an xarray Dataset object, and then
        # a new singleton 'time' dimension is added with the timestamp as its only value.
        grp_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(grp)).expand_dims(
            time=[timestamp_dt], axis=0
        )
    return grp_ds


def convert_nc_to_xrds(filepath: Path, group_path: str) -> xr.Dataset:
    """Alias of convert_hdf_to_xrds, kept for backward compatibility.

    The two functions were byte-identical duplicates; this one now delegates.
    """
    return convert_hdf_to_xrds(filepath, group_path)


# !!! Warning !!! The next cell can take a long time. It needs to be run once,
# but then should be commented-out if the zarr files already exist.
# Convert every downloaded HDF file into a per-timestep Zarr store, merging the
# data fields with the geolocation fields and renaming the anonymous dims.
print("Extracting groups...")
idcs_to_do = range(0, len(file_list))
for idx in tqdm(idcs_to_do, total=len(idcs_to_do)):
    fp = file_list[idx]
    data_ds = convert_hdf_to_xrds(fp, data_fields_path).rename(
        {
            "phony_dim_0": "XDim",
            "phony_dim_1": "YDim",
            "phony_dim_2": "nLayers",
            "phony_dim_3": "nWave3",
            "phony_dim_4": "nWave2",
        }
    )
    geo_ds = convert_nc_to_xrds(fp, geolocations_path).rename(
        {"phony_dim_5": "XDim", "phony_dim_6": "YDim"}
    )
    # mode="w" overwrites an existing store; without it a rerun raises
    # zarr.errors.ContainsGroupError (see the traceback captured below).
    data_ds.merge(geo_ds).to_zarr(f"zStore01/zarr_2024_07_09_#{idx:03}.zarr", mode="w")
    # if idx > 2:
    #     break
print("Done extracting groups.")
--------------------------------------------------------------------------- ContainsGroupError Traceback (most recent call last) Cell In[11], line 18 4 for idx in tqdm(idcs_to_do, total=len(idcs_to_do)): 5 fp = file_list[idx] 6 ( 7 convert_hdf_to_xrds(fp, data_fields_path) 8 .rename({"phony_dim_0": "XDim", 9 "phony_dim_1": "YDim", 10 "phony_dim_2": "nLayers", 11 "phony_dim_3": "nWave3" , 12 "phony_dim_4": "nWave2"}) 13 ).merge( 14 (convert_nc_to_xrds(fp, geolocations_path) 15 .rename({"phony_dim_5": "XDim", 16 "phony_dim_6": "YDim"}) 17 ) ---> 18 ).to_zarr(f"zStore01/zarr_2024_07_09_#{idx:03}.zarr") 20 #if idx > 2: 21 # break 22 print("Done extracting groups.") File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/core/dataset.py:2270, in Dataset.to_zarr(self, store, chunk_store, mode, synchronizer, group, encoding, compute, consolidated, append_dim, region, safe_chunks, storage_options, zarr_version, zarr_format, write_empty_chunks, chunkmanager_store_kwargs) 2102 """Write dataset contents to a zarr group. 2103 2104 Zarr chunks are determined in the following way: (...) 2266 The I/O user guide, with more details and examples. 
2267 """ 2268 from xarray.backends.api import to_zarr -> 2270 return to_zarr( # type: ignore[call-overload,misc] 2271 self, 2272 store=store, 2273 chunk_store=chunk_store, 2274 storage_options=storage_options, 2275 mode=mode, 2276 synchronizer=synchronizer, 2277 group=group, 2278 encoding=encoding, 2279 compute=compute, 2280 consolidated=consolidated, 2281 append_dim=append_dim, 2282 region=region, 2283 safe_chunks=safe_chunks, 2284 zarr_version=zarr_version, 2285 zarr_format=zarr_format, 2286 write_empty_chunks=write_empty_chunks, 2287 chunkmanager_store_kwargs=chunkmanager_store_kwargs, 2288 ) File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/backends/api.py:2217, in to_zarr(dataset, store, chunk_store, mode, synchronizer, group, encoding, compute, consolidated, append_dim, region, safe_chunks, storage_options, zarr_version, zarr_format, write_empty_chunks, chunkmanager_store_kwargs) 2214 already_consolidated = False 2215 consolidate_on_close = consolidated or consolidated is None -> 2217 zstore = backends.ZarrStore.open_group( 2218 store=mapper, 2219 mode=mode, 2220 synchronizer=synchronizer, 2221 group=group, 2222 consolidated=already_consolidated, 2223 consolidate_on_close=consolidate_on_close, 2224 chunk_store=chunk_mapper, 2225 append_dim=append_dim, 2226 write_region=region, 2227 safe_chunks=safe_chunks, 2228 zarr_version=zarr_version, 2229 zarr_format=zarr_format, 2230 write_empty=write_empty_chunks, 2231 **kwargs, 2232 ) 2234 dataset = zstore._validate_and_autodetect_region(dataset) 2235 zstore._validate_encoding(encoding) File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/backends/zarr.py:732, in ZarrStore.open_group(cls, store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, append_dim, write_region, safe_chunks, zarr_version, zarr_format, use_zarr_fill_value_as_mask, write_empty, 
cache_members) 707 @classmethod 708 def open_group( 709 cls, (...) 725 cache_members: bool = True, 726 ): 727 ( 728 zarr_group, 729 consolidate_on_close, 730 close_store_on_close, 731 use_zarr_fill_value_as_mask, --> 732 ) = _get_open_params( 733 store=store, 734 mode=mode, 735 synchronizer=synchronizer, 736 group=group, 737 consolidated=consolidated, 738 consolidate_on_close=consolidate_on_close, 739 chunk_store=chunk_store, 740 storage_options=storage_options, 741 zarr_version=zarr_version, 742 use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, 743 zarr_format=zarr_format, 744 ) 746 return cls( 747 zarr_group, 748 mode, (...) 756 cache_members, 757 ) File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/backends/zarr.py:1845, in _get_open_params(store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, zarr_version, use_zarr_fill_value_as_mask, zarr_format) 1841 if _zarr_v3(): 1842 # we have determined that we don't want to use consolidated metadata 1843 # so we set that to False to avoid trying to read it 1844 open_kwargs["use_consolidated"] = False -> 1845 zarr_group = zarr.open_group(store, **open_kwargs) 1847 close_store_on_close = zarr_group.store is not store 1849 # we use this to determine how to handle fill_value File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/zarr/hierarchy.py:1593, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, meta_array) 1591 raise ContainsArrayError(path) 1592 elif contains_group(store, path=path): -> 1593 raise ContainsGroupError(path) 1594 else: 1595 init_group(store, path=path, chunk_store=chunk_store) ContainsGroupError: path '' contains a group
4. Now Open the dataset
This will load the Zarr files saved locally, and then look at some of the values.
# Open all per-timestep Zarr stores as one combined dataset. Opening the raw
# .he5 files directly with combine='by_coords' fails ("Could not find any
# dimension coordinates" — see the traceback below), so we load the Zarr
# stores written above instead, as this section's description intends.
mfds = xr.open_mfdataset(
    glob("zStore01/zarr_2024_07_09_#*.zarr"), engine="zarr", combine="by_coords"
)
# mfds = xr.open_mfdataset(glob("data/dscovr/DSCOVR*.he5"), engine="netcdf4", combine="by_coords")
mfds
mfds--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[10], line 1 ----> 1 mfds = xr.open_mfdataset(glob("data/dscovr/DSCOVR*.he5"), 2 engine='netcdf4', combine='by_coords') 3 mfds File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/backends/api.py:1663, in open_mfdataset(paths, chunks, concat_dim, compat, preprocess, engine, data_vars, coords, combine, parallel, join, attrs_file, combine_attrs, **kwargs) 1650 combined = _nested_combine( 1651 datasets, 1652 concat_dims=concat_dim, (...) 1658 combine_attrs=combine_attrs, 1659 ) 1660 elif combine == "by_coords": 1661 # Redo ordering from coordinates, ignoring how they were ordered 1662 # previously -> 1663 combined = combine_by_coords( 1664 datasets, 1665 compat=compat, 1666 data_vars=data_vars, 1667 coords=coords, 1668 join=join, 1669 combine_attrs=combine_attrs, 1670 ) 1671 else: 1672 raise ValueError( 1673 f"{combine} is an invalid option for the keyword argument ``combine``" 1674 ) File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/structure/combine.py:983, in combine_by_coords(data_objects, compat, data_vars, coords, fill_value, join, combine_attrs) 979 grouped_by_vars = groupby_defaultdict(data_objects, key=vars_as_keys) 981 # Perform the multidimensional combine on each group of data variables 982 # before merging back together --> 983 concatenated_grouped_by_data_vars = tuple( 984 _combine_single_variable_hypercube( 985 tuple(datasets_with_same_vars), 986 fill_value=fill_value, 987 data_vars=data_vars, 988 coords=coords, 989 compat=compat, 990 join=join, 991 combine_attrs=combine_attrs, 992 ) 993 for vars, datasets_with_same_vars in grouped_by_vars 994 ) 996 return merge( 997 concatenated_grouped_by_data_vars, 998 compat=compat, (...) 
1001 combine_attrs=combine_attrs, 1002 ) File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/structure/combine.py:984, in <genexpr>(.0) 979 grouped_by_vars = groupby_defaultdict(data_objects, key=vars_as_keys) 981 # Perform the multidimensional combine on each group of data variables 982 # before merging back together 983 concatenated_grouped_by_data_vars = tuple( --> 984 _combine_single_variable_hypercube( 985 tuple(datasets_with_same_vars), 986 fill_value=fill_value, 987 data_vars=data_vars, 988 coords=coords, 989 compat=compat, 990 join=join, 991 combine_attrs=combine_attrs, 992 ) 993 for vars, datasets_with_same_vars in grouped_by_vars 994 ) 996 return merge( 997 concatenated_grouped_by_data_vars, 998 compat=compat, (...) 1001 combine_attrs=combine_attrs, 1002 ) File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/structure/combine.py:645, in _combine_single_variable_hypercube(datasets, fill_value, data_vars, coords, compat, join, combine_attrs) 639 if len(datasets) == 0: 640 raise ValueError( 641 "At least one Dataset is required to resolve variable names " 642 "for combined hypercube." 
643 ) --> 645 combined_ids, concat_dims = _infer_concat_order_from_coords(list(datasets)) 647 if fill_value is None: 648 # check that datasets form complete hypercube 649 _check_shape_tile_ids(combined_ids) File /opt/homebrew/Caskroom/miniconda/base/envs/temp-dscovr-baeza-notebook/lib/python3.12/site-packages/xarray/structure/combine.py:158, in _infer_concat_order_from_coords(datasets) 152 tile_ids = [ 153 tile_id + (position,) 154 for tile_id, position in zip(tile_ids, order, strict=True) 155 ] 157 if len(datasets) > 1 and not concat_dims: --> 158 raise ValueError( 159 "Could not find any dimension coordinates to use to " 160 "order the datasets for concatenation" 161 ) 163 combined_ids = dict(zip(tile_ids, datasets, strict=True)) 165 return combined_ids, concat_dims ValueError: Could not find any dimension coordinates to use to order the datasets for concatenation
5. Creating ColorMaps
In the original they built off of the Turbo color palette, but here I have defined more color palettes that are colorblind inclusive.
# Necessary imports
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

# Muted colormap: a colorblind-inclusive qualitative palette (Paul Tol's
# "muted" scheme), interpolated into a continuous colormap.
muted_cmap_name = "muted_cmap"
muted_qualitative_colors = [
    # (51 / 256, 34 / 256, 136 / 256),   # Indigo
    # (136 / 256, 204 / 256, 238 / 256), # Cyan
    (68 / 256, 170 / 256, 153 / 256),  # Teal
    (17 / 256, 119 / 256, 51 / 256),  # Green
    (153 / 256, 153 / 256, 51 / 256),  # Olive
    (221 / 256, 204 / 256, 119 / 256),  # Sand
    (204 / 256, 102 / 256, 119 / 256),  # Rose
    (136 / 256, 34 / 256, 85 / 256),  # Wine
    (170 / 256, 68 / 256, 153 / 256),  # Purple
]
muted_cm = LinearSegmentedColormap.from_list(muted_cmap_name, muted_qualitative_colors)
newcolors = muted_cm(np.linspace(0, 1, 256))
# Replace the lowest color with near-white so zero/low values read as "no data".
silver_color = np.array([0.95, 0.95, 0.95, 1])
newcolors[0, :] = silver_color
muted_cmap = ListedColormap(newcolors)
muted_cmap

# 6. Define mapping functions, define the total extent, and choose the data
# variable: the next cells define the extent for the later graphs, declare
# the data variable to look at, and define two utility functions.
# This utility function will simplify some code blocks later.
def get_geo_mask(lon, lat, bounds=None):
    """Return a boolean mask selecting points strictly inside a lon/lat box.

    Parameters
    ----------
    lon, lat : array-like
        Longitude and latitude arrays (broadcastable against each other).
    bounds : sequence of 4 floats, optional
        [minLon, maxLon, minLat, maxLat]. Defaults to the module-level
        ``extent``, preserving the original behavior for existing callers.
    """
    if bounds is None:
        bounds = extent  # fall back to the globally defined extent
    lon_mask = (lon > bounds[0]) & (lon < bounds[1])
    lat_mask = (lat > bounds[2]) & (lat < bounds[3])
    return lon_mask & lat_mask
# This utility function will simplify some code blocks later.
def get_data_arrays(a_timestep: int):
    """Return (lon, lat, values, timestamp) for one timestep of the global
    dataset, masked to the study extent and with +/-inf replaced by NaN.

    Relies on the module-level globals `mfds`, `data_variable`, and (through
    `get_geo_mask`) `extent`.
    """
    # Boolean mask of the pixels that fall inside the extent at this timestep.
    geo_mask = (
        get_geo_mask(mfds["Longitude"][a_timestep, :, :], mfds["Latitude"][a_timestep, :, :])
        .compute()
        .values
    )
    # .where() turns pixels outside the extent into NaN.
    lon_values = mfds["Longitude"][a_timestep, :, :].where(geo_mask).values
    lat_values = mfds["Latitude"][a_timestep, :, :].where(geo_mask).values
    var_values = mfds[data_variable][a_timestep, :, :].where(geo_mask).values
    # Convert the numpy datetime64 coordinate into a standard datetime object.
    timestamp = datetime.strptime(
        np.datetime_as_string(mfds["time"][a_timestep].values, unit="ms"), "%Y-%m-%dT%H:%M:%S.%f"
    )
    # Replace any +inf or -inf values with NaN.
    var_values[np.isinf(var_values)] = np.nan
    return lon_values, lat_values, var_values, timestamp
# Defines the extent for greater specification.
extent = [-79, -73, 35, 39.5]  # extent = [minLon, maxLon, minLat, maxLat]
central_lon = np.mean(extent[:2])
central_lat = np.mean(extent[2:])

# Declare the desired data variable
data_variable = "UVAerosolIndex"

# 7. Make a before and during timestep UVAI map: this section creates timestep
# maps for before and during the peak of the smoke extent in Hampton Roads.
# Country boundaries (Natural Earth, 50 m scale)
country_bodr = cfeature.NaturalEarthFeature(
    category="cultural",
    name="admin_0_boundary_lines_land",
    scale="50m",
    facecolor="none",
    edgecolor="k",
)
# Province boundaries
provinc_bodr = cfeature.NaturalEarthFeature(
    category="cultural",
    name="admin_1_states_provinces_lines",
    scale="50m",
    facecolor="none",
    edgecolor="k",
)


def plot_timestep_map(a_timestep, values_min=0, values_max=10, save_path=None):
    """Draw a contour-filled map of the global `data_variable` at one timestep.

    Extracted from four nearly identical copy-pasted plotting cells; each cell
    now becomes a single call.

    Parameters
    ----------
    a_timestep : int
        Index into the time dimension of the global `mfds` dataset.
    values_min, values_max : float
        Color scale limits for the contour plot (0 and 10 in every original cell).
    save_path : str, optional
        When given, the figure is also saved there with bbox_inches="tight".

    Uses the module-level globals `mfds`, `data_variable`, `muted_cmap`, and
    (via get_geo_mask) `extent`.
    """
    # Get the data arrays for the selected timestep, masked to the extent.
    # (Kept inline rather than calling get_data_arrays: the original cells did
    # NOT replace +/-inf with NaN, and that behavior is preserved here.)
    geo_mask = (
        get_geo_mask(mfds["Longitude"][a_timestep, :, :], mfds["Latitude"][a_timestep, :, :])
        .compute()
        .values
    )
    lon_values = mfds["Longitude"][a_timestep, :, :].where(geo_mask).values
    lat_values = mfds["Latitude"][a_timestep, :, :].where(geo_mask).values
    var_values = mfds[data_variable][a_timestep, :, :].where(geo_mask).values
    timestamp = datetime.strptime(
        np.datetime_as_string(mfds["time"][a_timestep].values, unit="ms"), "%Y-%m-%dT%H:%M:%S.%f"
    )
    # Encode the actual plot.
    my_projection = ccrs.PlateCarree()
    fig, ax = plt.subplots(figsize=(12, 9), subplot_kw={"projection": my_projection})
    vmin, vmax = values_min, values_max
    levels = 50
    level_boundaries = np.linspace(vmin, vmax, levels + 1)
    _contourf = ax.contourf(
        lon_values,
        lat_values,
        var_values,
        cmap=muted_cmap,
        transform=ccrs.PlateCarree(),
        levels=level_boundaries,
        vmin=vmin,
        vmax=vmax,
        extend="both",
    )
    ax.add_feature(cfeature.STATES, linestyle="--", linewidth=1, edgecolor="w")
    cb_handle = plt.colorbar(
        _contourf,
        orientation="horizontal",
        ticks=range(int(np.floor(vmin)), int(np.ceil(vmax + 1)), 1),
        boundaries=level_boundaries,
        values=(level_boundaries[:-1] + level_boundaries[1:]) / 2,
        ax=ax,
    )
    cb_handle.ax.set_title(data_variable, fontsize=18)
    ax.set_title("%s" % (timestamp.strftime("%Y%m%d %H:%M:%S")), fontsize=18)
    gl_handle = ax.gridlines(
        crs=ccrs.PlateCarree(),
        draw_labels=["left", "bottom"],
        linewidth=0.8,
        color="gray",
        alpha=0.5,
        linestyle=":",
    )
    # We change the fontsize of the tick labels.
    ax.tick_params(axis="both", which="major", labelsize=16)
    ax.tick_params(axis="both", which="minor", labelsize=16)
    cb_handle.ax.tick_params(axis="both", which="minor", labelsize=16)
    gl_handle.xlabel_style, gl_handle.ylabel_style = {"fontsize": 16}, {"fontsize": 16}
    if save_path is not None:
        plt.savefig(save_path, bbox_inches="tight")
    plt.show()


# 7. Make a before and during timestep UVAI map.
# Timestep 30 shows clean air before the smoke arrived.
timestep01 = 30
plot_timestep_map(timestep01, save_path="UVAerosolIndex/June05")

# Timestep 48 shows dangerous levels of air quality as the smoke moved into the area.
timestep02 = 48
plot_timestep_map(timestep02)

# 8. Change the data variable; create maps for the final aerosol layer height.
# (The original also recomputed geo_mask/lon/lat/var here with the stale
# timestep before immediately overwriting them — that dead code is removed.)
data_variable = "FinalAerosolLayerHeight"
timestep01 = 48
plot_timestep_map(timestep01, save_path="FinalAerosolLayerHeight/June06")
timestep01 = 99
plot_timestep_map(timestep01)

# Section 2: AQS and EPIC Comparisons
# 1. Generate times and n_time; create a getter function for Hampton Roads data.

# Declare the desired data variable
data_variable = "UVAerosolIndex"

# Convert the dataset's datetime64 time coordinate into datetime objects.
times = [
    datetime.strptime(np.datetime_as_string(t.values, unit="ms"), "%Y-%m-%dT%H:%M:%S.%f")
    for t in mfds["time"]
]
n_time = len(mfds.time)


def get_hampton_value(a_timestep: int, query_point_lon_lat: tuple):
    """Find the valid grid point nearest to a query (lon, lat) at one timestep.

    Returns
    -------
    (distance, flat_index, coords)
        `flat_index` indexes the *flattened* lon/lat/value arrays for this
        timestep, so callers can look up the matching data value.
    """
    lon_values, lat_values, var_values, timestamp = get_data_arrays(a_timestep)
    lon_flat, lat_flat = lon_values.flatten(), lat_values.flatten()
    # NaN pixels are left as (0, 0) placeholders so that flat indices still
    # line up with the flattened data arrays; (0, 0) is far from the study
    # area, so it should never win the nearest-neighbor query.
    new2d = np.zeros((len(lon_flat), 2))
    for idx, (a, b) in enumerate(zip(lon_flat, lat_flat)):
        if not np.isnan(a):
            new2d[idx, :] = [a, b]
    nearestpt_distance, nearestpt_index = spatial.KDTree(new2d).query(query_point_lon_lat)
    nearestpt_coords = new2d[nearestpt_index, :]
    return nearestpt_distance, nearestpt_index, nearestpt_coords


# !!! Warning !!! The next cell can take a long time (e.g., 20+ minutes).
# It needs to be run once, but then should be commented-out.
# Retrieve the Hampton values.
# Hampton, VA: 36.9339° N, 76.3637° W
pt_hampton = [-76.3637, 37]
nearestpt_distances = []
nearestpt_indices = []
nearestpt_coords = []
for i in tqdm(range(n_time)):
    nearest_distance, nearest_index, nearest_coords = get_hampton_value(i, pt_hampton)
    nearestpt_distances.append(nearest_distance)
    nearestpt_indices.append(nearest_index)
    nearestpt_coords.append(nearest_coords)

# Use min-max scores to remove outliers: timesteps whose nearest valid pixel
# is too far from Hampton (normalized distance >= 0.70) are discarded, as are
# negative data values.
timeseries_values = []
times = []  # rebuilt to hold only the timestamps that survive filtering
max_x = max(nearestpt_distances)
min_x = min(nearestpt_distances)
for i in tqdm(range(len(nearestpt_distances))):
    x = nearestpt_distances[i]
    xscore = (x - min_x) / (max_x - min_x)  # normalized distance in [0, 1]
    _, _, var_values, timestamp = get_data_arrays(i)
    if xscore < 0.70:
        nearestpt_var_value = var_values.flatten()[nearestpt_indices[i]]
        if nearestpt_var_value >= 0:
            timeseries_values.append(nearestpt_var_value)
            times.append(timestamp)
n_time = len(timeseries_values)
ntime = len(nearestpt_distances)
print("Original Number: ", ntime, "\nWithoutOutliers ", n_time)

# 2. Use RSIG to get EPA Air Quality data from the same time period.
# Query EPA AQS PM2.5 observations for the Hampton Roads area over the study period.
hampton_roads_rsigapi = pyrsig.RsigApi(
    bdate="2023-06-02 00",
    edate="2023-06-16 23:59:59",
    bbox=(-77, 36, -75, 38),  # Hampton Roads min lon, min lat, max lon, max lat
)
hampton_roads_aqsno2df = hampton_roads_rsigapi.to_dataframe(
    "aqs.pm25", parse_dates=True, unit_keys=False
)
# Median PM2.5 across stations for each timestamp.
hr_aqs_time = hampton_roads_aqsno2df.groupby(["time"]).median(numeric_only=True)["pm25"]
hr_aqs_latitude = hampton_roads_aqsno2df["LATITUDE"]
hr_aqs_longitude = hampton_roads_aqsno2df["LONGITUDE"]

# 3. Make the combined figure.
# Colors drawn from the muted palette above.
color_green = (17 / 256, 119 / 256, 51 / 256)  # Green
color_purple = (204 / 256, 102 / 256, 119 / 256)  # Rose
fig, axs = plt.subplots(figsize=(10, 12), nrows=2, ncols=1, sharex=True)
# Top panel: ground-based AQS PM2.5.
axn = 0
axs[axn].plot(
    hr_aqs_time.index.values, hr_aqs_time.values, markersize=9, linewidth=4, color=color_purple
)
axs[axn].set_ylabel("Hampton, VA\n(AQS pm25)", fontsize=18, color="k")
# Bottom panel: satellite-derived aerosol index at Hampton.
axn = 1
axs[axn].plot(times, timeseries_values, ":x", markersize=7, color=color_green)
axs[axn].set_ylabel(
    f"{data_variable} @Hampton, VA\n(DSCOVR_EPIC_L2_AER)", fontsize=18, color="black"
)
# Change the aesthetics.
for a in axs:
    # Highlight the June 6-10 smoke event.
    a.axvspan(
        datetime(2023, 6, 6, 0, 0, 0), datetime(2023, 6, 10, 0, 0, 0), alpha=0.3, color="pink"
    )
    a.axhline(0, linestyle=":", alpha=0.3, color="gray")
    a.tick_params(axis="both", which="major", labelsize=16)
    a.tick_params(axis="both", which="minor", labelsize=16)
    a.grid(visible=True, which="major", axis="both", color="lightgray", linestyle=":")
    a.tick_params(color="gray", labelcolor="black")
    for spine in a.spines.values():
        spine.set_edgecolor("gray")
plt.xticks(rotation=45)
plt.tight_layout()
# plt.savefig("DscovrFigures/01_dscovr_epic_uvai_with_AQS_timeseries.png", dpi=200, bbox_inches="tight")
plt.show()

# 4. Parsing the AQS data to minimize the time difference between points.
from pandas import to_datetime
from cartopy.io.shapereader import Reader
import math
import statistics
from scipy import stats
from sklearn.metrics import mean_squared_error
def get_normalized_data(data):
    """Min-max normalize *data* in place to the range [0, 1] and return it.

    NOTE: mutates the input list (existing callers pass lists they also keep
    references to, so the in-place behavior is preserved).

    Parameters
    ----------
    data : list of float
        Non-empty sequence of values to rescale.

    Returns
    -------
    list of float
        The same list object, rescaled so min -> 0.0 and max -> 1.0.
        A constant series maps to all zeros.
    """
    max_pt = max(data)
    min_pt = min(data)
    span = max_pt - min_pt
    for i, x in enumerate(data):
        # BUGFIX: guard against a constant series, which previously raised
        # ZeroDivisionError.
        data[i] = (x - min_pt) / span if span else 0.0
    return data
def get_date_splits(
    data,
    times,
    day1=datetime(2023, 6, 6, 0, 0, 0),
    day2=datetime(2023, 6, 10, 0, 0, 0),
):
    """Split *data* into (before, during, after) relative to an event window.

    Generalized: the window bounds are now parameters, defaulting to the
    June 2023 smoke-event dates so existing two-argument calls are unchanged.

    Parameters
    ----------
    data : sequence
        Values to bucket; aligned element-wise with *times*.
    times : sequence of datetime-like
        Timestamps comparable with *day1*/*day2*.
    day1, day2 : datetime, optional
        Window start (inclusive of "during") and end.

    Returns
    -------
    tuple of (list, list, list)
        (values before day1, values in [day1, day2), values at/after day2).
    """
    before, during, after = [], [], []
    for value, stamp in zip(data, times):
        if stamp < day1:
            before.append(value)
        elif stamp < day2:
            during.append(value)
        else:
            after.append(value)
    return before, during, after
# Because there are more points from the EPA I am going to remove some
# For each DSCOVR timestamp, keep only the single AQS sample (the one just
# before or just after it) that is closest in time, so the two series have
# the same length.
aqs_Times = []
aqs_Values = []
aqs_Lats = []
aqs_Longs = []
aqs_Count = 12  # Here the EPA data catches up to the DSCOVR Data
current = to_datetime(hr_aqs_time.index.values[aqs_Count])
total_error = 0  # NOTE(review): never updated or read below; kept for compatibility
for i in range(len(times)):
    x = times[i]
    # Advance until `current` is the first AQS timestamp at or after x.
    while current < x:
        aqs_Count += 1
        current = to_datetime(hr_aqs_time.index.values[aqs_Count])
    # BUGFIX: the previous sample lives at index aqs_Count - 1. The original
    # computed `index.values[aqs_Count] - 1`, i.e. subtracted the integer 1
    # from the timestamp value itself instead of stepping back one index.
    prev = to_datetime(hr_aqs_time.index.values[aqs_Count - 1])
    prev_difference = x - prev
    current_difference = current - x
    # Pick whichever neighbor is nearer in time (single code path instead of
    # the original duplicated append branches).
    nearest = aqs_Count if current_difference < prev_difference else aqs_Count - 1
    aqs_Times.append(to_datetime(hr_aqs_time.index.values[nearest]))
    aqs_Values.append(hr_aqs_time.values[nearest])
    aqs_Lats.append(hr_aqs_latitude.values[nearest])
    aqs_Longs.append(hr_aqs_longitude.values[nearest])
normalized_dscovr = get_normalized_data(timeseries_values)
normalized_aqs = get_normalized_data(aqs_Values)

# Overlay the two normalized timeseries on a single set of axes.
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(times, normalized_dscovr, ":x", markersize=7, color=color_green, label="DSCOVR EPIC")
ax.plot(aqs_Times, normalized_aqs, markersize=9, linewidth=4, color=color_purple, label="AQS")
ax.legend()
ax.set_title(f"{data_variable} @Hampton Roads, VA\n Normalized DSCOVR EPIC and AQS", fontsize=18)
# Detailing the axis labels
for tick_kind in ("major", "minor"):
    ax.tick_params(axis="both", which=tick_kind, labelsize=16)
plt.xticks(rotation=45)
# plt.savefig("DscovrFigures/02_UVAI_Initial_Comparison.png", dpi=200, bbox_inches="tight")
plt.show()
# 5. RSIG and DSCOVR EDA
# 5. RSIG and DSCOVR EDA
# Split both normalized series into before/during/after the event window.
dscovr_before, dscovr_during, dscovr_after = get_date_splits(normalized_dscovr, times)
aqs_before, aqs_during, aqs_after = get_date_splits(normalized_aqs, aqs_Times)
# Root-mean-square error between the two series, overall and per period.
total_rsme = math.sqrt(mean_squared_error(normalized_aqs, normalized_dscovr))
before_rsme = math.sqrt(mean_squared_error(aqs_before, dscovr_before))
during_rsme = math.sqrt(mean_squared_error(aqs_during, dscovr_during))
after_rsme = math.sqrt(mean_squared_error(aqs_after, dscovr_after))
print("RSME Total:\t\t\t", total_rsme)
print("RSME Before:\t\t\t", before_rsme)
print("RSME During:\t\t\t", during_rsme)
print("RSME After:\t\t\t", after_rsme)
# Variance of each series and of its per-period pieces.
# NOTE(review): the per-period variances deliberately reuse the overall mean
# (xbar) rather than each subset's own mean — presumably to measure spread
# about the full-series center; confirm this is intended.
dscovr_xbar = sum(normalized_dscovr) / len(normalized_dscovr)
var_dscovr = statistics.variance(normalized_dscovr, dscovr_xbar)
print("\nDSCOVR Normalized Variance:\t", var_dscovr)
for label, subset in (
    ("Variance Before:\t\t", dscovr_before),
    ("Variance During:\t\t", dscovr_during),
    ("Variance After:\t\t\t", dscovr_after),
):
    print(label, statistics.variance(subset, dscovr_xbar))
aqs_xbar = sum(normalized_aqs) / len(normalized_aqs)
var_aqs = statistics.variance(normalized_aqs, aqs_xbar)
print("\nEPA Normalized Variance:\t", var_aqs)
for label, subset in (
    ("Variance Before:\t\t", aqs_before),
    ("Variance During:\t\t", aqs_during),
    ("Variance After:\t\t\t", aqs_after),
):
    print(label, statistics.variance(subset, aqs_xbar))
# Is Location the Issue?
# Is Location the Issue?
# Separate the nearest-point coordinates into latitude and longitude lists.
dscovr_lats = [pt[1] for pt in nearestpt_coords]
dscovr_longs = [pt[0] for pt in nearestpt_coords]

# Build the map projection and axes.
map_crs = ccrs.PlateCarree()
fig, ax = plt.subplots(figsize=(12, 9), subplot_kw={"projection": map_crs})
ax.set_facecolor("lightblue")
# County-level shapefile supplies the local outlines.
county_shapefile = "content/cartopy_data/cultural/ne_10m_admin_2_counties.shp"
counties = cfeature.ShapelyFeature(
    Reader(county_shapefile).geometries(), map_crs, facecolor="white", edgecolor="k"
)
ax.add_feature(counties, linestyle=":", zorder=0)
ax.set_extent([-76.7, -76, 36.8, 37.2], crs=map_crs)
ax.add_feature(cfeature.STATES, linestyle="-", linewidth=0.8, edgecolor="k", zorder=1)
# Plot both sets of point locations.
ax.scatter(aqs_Longs, aqs_Lats, marker="x", s=150, color=color_purple, zorder=2, label="RSIG Data")
ax.scatter(
    dscovr_longs,
    dscovr_lats,
    marker="o",
    s=30,
    # NOTE(review): `data=` is normally a label-lookup mapping for string
    # arguments; passing the raw values here looks unintended — confirm.
    data=timeseries_values,
    c=np.arange(len(dscovr_longs)),
    cmap=muted_cmap,
    zorder=3,
    label="DSCOVR Data",
)
ax.legend()
# Colorbar reuses `_contourf` (the mappable from an earlier map cell).
cb_handle = plt.colorbar(
    _contourf,
    orientation="horizontal",
    ticks=range(int(np.floor(vmin)), int(np.ceil(vmax + 1)), 1),
    boundaries=level_boundaries,
    values=(level_boundaries[:-1] + level_boundaries[1:]) / 2,
    ax=ax,
)
cb_handle.ax.set_title(data_variable, fontsize=16)
ax.set_title("All Data Point Locations in Hampton Roads", fontsize=18)
gl_handle = ax.gridlines(
    crs=ccrs.PlateCarree(),
    draw_labels=["left", "bottom"],
    linewidth=0.8,
    color="gray",
    alpha=0.5,
    linestyle=":",
)
# Detailing the axis labels
ax.tick_params(axis="both", which="major", labelsize=16)
ax.tick_params(axis="both", which="minor", labelsize=16)
cb_handle.ax.tick_params(axis="both", which="minor", labelsize=16)
gl_handle.xlabel_style, gl_handle.ylabel_style = {"fontsize": 16}, {"fontsize": 16}
# plt.savefig("DscovrFigures/03_Locations_Graph", bbox_inches="tight")
plt.show()

# Linear regression between the (unshifted) normalized series.
slope, intercept, r_val, p_val, std_err = stats.linregress(normalized_aqs, normalized_dscovr)
for label, value in (
    ("Slope:\t\t", slope),
    ("Intercept:\t", intercept),
    ("R Value:\t", r_val),
    ("P Value:\t", p_val),
    ("Standard Error:\t", std_err),
):
    print(label, value)
# Scatter the paired points and overlay the fitted line.
plt.scatter(normalized_aqs, normalized_dscovr, color=color_green)
plt.plot(normalized_aqs, slope * np.array(normalized_aqs) + intercept, color=color_purple)
plt.title("AQS and DSCOVR EPIC Linear Regression")
plt.xlabel("Normalized AQS Data")
plt.ylabel("Normalized DSCOVR EPIC Data")
# plt.savefig("DscovrFigures/04_Initial_Linear_Regression", bbox_inches="tight")
plt.show()
# 6. Minimizing Error by shifting AQS time
# 6. Minimizing Error by shifting AQS time
# Slide the AQS series earlier by `shift` samples and keep the shift whose
# RMSE against the front-truncated DSCOVR series is smallest. The search is
# capped at one third of the series length.
best_timeshift_dscovr = []
best_timeshift_aqs = []
# BUGFIX: seed with +inf so the best shift is always recorded. The original
# seed of 1 silently left best_timeshift_* empty whenever every candidate
# RMSE was >= 1.
min_rsme = float("inf")
best_dilation = 0
for shift in range(int(len(normalized_aqs) / 3)):
    timeshift_normalized_aqs = normalized_aqs[shift : len(normalized_aqs)]
    timeshift_normalized_dscovr = normalized_dscovr[0 : len(normalized_dscovr) - shift]
    curr_rsme = math.sqrt(mean_squared_error(timeshift_normalized_aqs, timeshift_normalized_dscovr))
    if curr_rsme < min_rsme:
        min_rsme = curr_rsme
        best_dilation = shift
        best_timeshift_dscovr = timeshift_normalized_dscovr
        best_timeshift_aqs = timeshift_normalized_aqs

fig, ax = plt.subplots(figsize=(12, 9))
# Plot the best-shift alignment: both series share the truncated time axis.
trimmed_times = times[0 : len(times) - best_dilation]
ax.plot(
    trimmed_times,
    best_timeshift_dscovr,
    ":x",
    markersize=7,
    color=color_green,
    label="DSCOVR EPIC",
)
ax.plot(
    trimmed_times,
    best_timeshift_aqs,
    markersize=9,
    linewidth=4,
    color=color_purple,
    label="AQS",
)
ax.legend()
ax.set_title(
    f"{data_variable} @Hampton Roads, VA\n Normalized DSCOVR EPIC and Timeshifted AQS", fontsize=18
)
# Detailing the axis labels
for tick_kind in ("major", "minor"):
    ax.tick_params(axis="both", which=tick_kind, labelsize=16)
plt.xticks(rotation=45)
# plt.savefig("DscovrFigures/05_UVAI_Timeshift_Comparison.png", dpi=200, bbox_inches="tight")
plt.show()

# Re-run the before/during/after EDA on the best-shift series.
dscovr_before, dscovr_during, dscovr_after = get_date_splits(
    best_timeshift_dscovr, times[0 : len(times) - best_dilation]
)
# BUGFIX: split the *timeshifted* AQS series. The original split the full,
# unshifted normalized_aqs against the truncated time axis, so the per-period
# pieces were misaligned with (and paired against the wrong values of) the
# DSCOVR splits below.
aqs_before, aqs_during, aqs_after = get_date_splits(
    best_timeshift_aqs, times[0 : len(times) - best_dilation]
)
# RMSE between the shifted series, overall and per period.
total_rsme = math.sqrt(mean_squared_error(best_timeshift_aqs, best_timeshift_dscovr))
before_rsme = math.sqrt(mean_squared_error(aqs_before, dscovr_before))
during_rsme = math.sqrt(mean_squared_error(aqs_during, dscovr_during))
after_rsme = math.sqrt(mean_squared_error(aqs_after, dscovr_after))
print("RSME Total:\t\t\t", total_rsme)
print("RSME Before:\t\t\t", before_rsme)
print("RSME During:\t\t\t", during_rsme)
print("RSME After:\t\t\t", after_rsme)
# Variance about each full series' mean (same convention as the earlier EDA).
timeshift_dscovr_xbar = sum(best_timeshift_dscovr) / len(best_timeshift_dscovr)
timeshift_var_dscovr = statistics.variance(best_timeshift_dscovr, timeshift_dscovr_xbar)
print("\nDSCOVR Normalized Variance:\t", timeshift_var_dscovr)
print("Variance Before:\t\t", statistics.variance(dscovr_before, timeshift_dscovr_xbar))
print("Variance During:\t\t", statistics.variance(dscovr_during, timeshift_dscovr_xbar))
print("Variance After:\t\t\t", statistics.variance(dscovr_after, timeshift_dscovr_xbar))
timeshift_aqs_xbar = sum(best_timeshift_aqs) / len(best_timeshift_aqs)
timeshift_var_aqs = statistics.variance(best_timeshift_aqs, timeshift_aqs_xbar)
print("\nAQS Normalized Variance:\t", timeshift_var_aqs)
print("Variance Before:\t\t", statistics.variance(aqs_before, timeshift_aqs_xbar))
print("Variance During:\t\t", statistics.variance(aqs_during, timeshift_aqs_xbar))
print("Variance After:\t\t\t", statistics.variance(aqs_after, timeshift_aqs_xbar))
# Regression of the shifted pair.
slope, intercept, r_val, p_val, std_err = stats.linregress(
    best_timeshift_aqs, best_timeshift_dscovr
)
print("Slope:\t\t", slope)
print("Intercept:\t", intercept)
print("R Value:\t", r_val)
print("P Value:\t", p_val)
print("Standard Error:\t", std_err)
# Scatter the shifted pair and overlay the fitted line.
plt.scatter(best_timeshift_aqs, best_timeshift_dscovr, color=color_green)
# BUGFIX: draw the fitted line over the same x data that was scattered and
# fit (best_timeshift_aqs). The original used the full-length, unshifted
# normalized_aqs, which has a different length and meaning.
plt.plot(
    best_timeshift_aqs,
    slope * np.array(best_timeshift_aqs) + intercept,
    color=color_purple,
)
plt.title("DSCOVR EPIC and Timeshift AQS Linear Regression")
plt.xlabel("Normalized Timeshifted AQS Data")
plt.ylabel("Normalized DSCOVR EPIC Data")
# plt.savefig("DscovrFigures/06_Timeshift_Linear_Regression", bbox_inches="tight")
plt.show()
# END of Notebook.