TempEst-NEXT Real-Time National Forecasting¶

import NEXT
import NEXT.wforecast as wfc
import os
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import shapely as shp
from pynhd import WaterData, NLDI
from time import sleep
import warnings
import numpy as np
import datetime

Prepare Environment¶

dirs = ["hist_data/", "hrrr_data/", "final_hist/", "gfs_raw/", "forecasts/", 'figures/', 'full_forecasts/']
for d in dirs:
    if not os.path.exists(d):
        os.mkdir(d)

Identify Sites¶

By default, the forecast runs for 1,000 HUC-12 pour points sampled from across the contiguous US. You can modify this by changing the site selection below and removing the cached site file. The sampling procedure is to select 1,000 different HUC-8s, then sample one HUC-12 from each. (The actual default dataset contains 913 sites after misc errors.)

sitefile = "forecast_points.json"
run_sites = not os.path.exists(sitefile)

def try_and_sleep(f):
    sleep(5)
    try:
        return f()
    except:
        return None

# Wait between calls because otherwise it tends to overload the API
if run_sites:
    huc6 = WaterData("wbd06")
    huc12 = WaterData("wbd12")
    all_huc6 = huc6.bybox((-125, 25, -65, 50))
    huc12_samp = pd.concat([
        try_and_sleep(lambda: huc12.bygeom(x.geometry).sample(n=50)) # get extra samples because some will crash
        for x in all_huc6.itertuples()
    ]).sample(n=1200)
    pps = NLDI().getfeature_byid("huc12pp", huc12_samp["huc12"])
    geodata = (huc12_samp.loc[:, ["geometry", "huc12", "areasqkm"]].
        rename(columns={"huc12": "id", "areasqkm": "area"}).
        merge(
            pd.DataFrame({
                "id": pps["identifier"],
                "lat": pps.geometry.y,
                "lon": pps.geometry.x
            }),
            on="id"
        )).dropna().sample(1000)
    geodata.loc[:, "area"] = geodata["area"] * 1e6 # km2 -> m2
    geodata.to_file(sitefile)

else:
    geodata = gpd.read_file(sitefile)

geodata.plot()

<Axes: >

Retrieve Historical Data¶

Humidity is retrieved from GridMet. Air temperature is from HRRR so that climatology will be more similar to GFS, for which NEXT doesn’t have long-term archive retrieval (just recent forecast runs).

The below loops may crash periodically due to server errors. If so, just restart them, as they are set up to resume where they left off. GridMet retrieval takes several hours, and HRRR retrieval is currently exceptionally slow for no obvious reason, on the order of 1 hour per site. Due to that challenge, HRRR is only retrieved for one year for a sample of sites, which is used to fit a correction function.

runhist = len(os.listdir(dirs[2])) == 0

basepath = dirs[0] + "/"
if runhist:
    with warnings.catch_warnings(action="ignore"):
        for row in geodata.itertuples():
            fn = basepath + row.id + ".csv"
            if not os.path.exists(fn):
                # note: site type "huc12" doesn't actually exist. It will internally convert to coordinates.
                try:
                    NEXT.data.geom_full_data(row.id, "huc12", gpd.GeoSeries([row.geometry], crs=4326), row.lat, row.lon, row.area,
                                             "2015", "2025", "gridmet").to_csv(fn, index=False)
                except:
                    print(row.id)

The following block can be cancelled after several dozen sites have been retrieved.

if runhist:
    allids = [x.split('.')[0] for x in os.listdir(dirs[0])]  # don't run HRRR if we don't have other info
    basepath = dirs[1] + '/'
    with warnings.catch_warnings(action="ignore"):
        for row in geodata.itertuples():
            fn = basepath + row.id + ".csv"
            if (not os.path.exists(fn)) and (row.id in allids):
                NEXT.data.weather_hrrr(gpd.GeoSeries([row.geometry], crs=4326), "2025-01-01", "2025-12-31").to_csv(fn, index=False)

Compute corrections:

allids = [x.split('.')[0] for x in os.listdir(dirs[1])]
combined = pd.concat([
    pd.read_csv(dirs[0] + "/" + row.id + ".csv", parse_dates=["date"]).
        merge(pd.read_csv(dirs[1] + "/" + row.id + ".csv", parse_dates=["date"]),
              on="date",
              suffixes=["_gm", "_hrrr"])
    for row in geodata.itertuples()
    if row.id in allids
])

combined.plot.scatter(x="tmax_gm", y="tmax_hrrr")

<Axes: xlabel='tmax_gm', ylabel='tmax_hrrr'>

combined["tmax_gm"].corr(combined["tmax_hrrr"])

np.float64(0.9543465835168514)

(combined.groupby("id")[["tmax_hrrr", "tmax_gm"]].corr().
    loc[(slice(None), "tmax_gm"), "tmax_hrrr"].describe())

count    78.000000
mean      0.945189
std       0.037704
min       0.762891
25%       0.935038
50%       0.956855
75%       0.967331
max       0.978917
Name: tmax_hrrr, dtype: float64

The relationship is imperfect, but we observe that sitewise correlation is nearly identical to global correlation, so a global correction can be applied.

fit = np.linalg.lstsq(
    np.array([np.ones(len(combined)), combined["tmax_gm"]]).T,
    combined["tmax_hrrr"])[0]

if runhist:
    for fn in os.listdir(dirs[0]):
        if fn.endswith('.csv'):
            data = pd.read_csv(dirs[0] + "/" + fn, parse_dates=["date"])
            data["tmax"] = data["tmax"] * fit[1] + fit[0]
            data.to_csv(dirs[2] + '/' + fn, index=False)

Retrieve Forecast Data and Execute Forecasts¶

model = NEXT.NEXT.from_pickle('coefs_hrrr.pickle')

allids = [x.split('.')[0] for x in os.listdir(dirs[2])]
torun = geodata.loc[geodata['id'].isin(allids)]
earlier = (datetime.date.today() - datetime.timedelta(6)).strftime("%Y%m%d")
today = datetime.date.today().strftime("%Y%m%d")
with warnings.catch_warnings(action='ignore'):
    fcst_weath = (pd.concat([wfc.get_gfs_downloaded(torun, earlier, dirs[3]),
                             wfc.get_gfs_downloaded(torun, today, dirs[3])]).
        groupby(["id", "date"], as_index=False)["tmax"].last().sort_values("date"))
    for row in torun.itertuples():
        infn = dirs[2] + row.id + '.csv'
        outfn = dirs[4] + row.id + '_' + today + '.csv'
        if not os.path.exists(outfn):
            try:
                model.make_newt(pd.read_csv(infn, parse_dates=['date']), reset=True)
                model.run(fcst_weath.loc[fcst_weath['id'] == row.id]).to_csv(outfn, index=False)
            except KeyboardInterrupt:
                break
            except:
                print(row.id)

080901000101

Analyze The Forecast¶

today = datetime.date.today().strftime("%Y%m%d")
forecast = pd.concat([
    pd.read_csv(dirs[4] + fn, parse_dates=['date']).
        assign(id = fn.split('_')[0])
    for fn in os.listdir(dirs[4])
    if today in fn and fn.endswith('.csv')
])
# Limit to weekly forecast
forecast = forecast.loc[(forecast['date'] >= np.datetime64('today')) &
    (forecast['date'] < np.datetime64('today') + 7)]
forecast = forecast.merge(geodata, on='id').dropna()
forecast.drop(columns=['day', 'period', 'geometry']).to_csv(dirs[6] + today + '.csv', index=False)
forecast

	id	date	tmax	day	period	prediction	seasonal	anomaly	area	lat	lon	geometry
0	010900050201	2026-04-13	15.362663	103	103	8.945897	10.911749	-1.965852	66570000.0	41.442098	-71.590871	MULTIPOLYGON (((-71.51761 41.55376, -71.51733 ...
1	010900050201	2026-04-14	21.079019	104	104	9.638248	11.044416	-1.406168	66570000.0	41.442098	-71.590871	MULTIPOLYGON (((-71.51761 41.55376, -71.51733 ...
2	010900050201	2026-04-15	23.610065	105	105	10.816572	11.177676	-0.361104	66570000.0	41.442098	-71.590871	MULTIPOLYGON (((-71.51761 41.55376, -71.51733 ...
3	010900050201	2026-04-16	21.855503	106	106	11.754032	11.311491	0.442541	66570000.0	41.442098	-71.590871	MULTIPOLYGON (((-71.51761 41.55376, -71.51733 ...
4	010900050201	2026-04-17	23.611570	107	107	11.973921	11.445821	0.528099	66570000.0	41.442098	-71.590871	MULTIPOLYGON (((-71.51761 41.55376, -71.51733 ...
...	...	...	...	...	...	...	...	...	...	...	...	...
6449	030601020108	2026-04-15	28.654725	105	105	9.599104	9.068038	0.531067	68530000.0	34.716343	-83.354611	MULTIPOLYGON (((-83.39605 34.78676, -83.39639 ...
6450	030601020108	2026-04-16	27.555318	106	106	11.424976	8.912083	2.512893	68530000.0	34.716343	-83.354611	MULTIPOLYGON (((-83.39605 34.78676, -83.39639 ...
6451	030601020108	2026-04-17	29.583026	107	107	11.641974	8.755528	2.886446	68530000.0	34.716343	-83.354611	MULTIPOLYGON (((-83.39605 34.78676, -83.39639 ...
6452	030601020108	2026-04-18	28.607069	108	108	11.782561	8.598419	3.184142	68530000.0	34.716343	-83.354611	MULTIPOLYGON (((-83.39605 34.78676, -83.39639 ...
6453	030601020108	2026-04-19	24.017823	109	109	10.909466	8.440803	2.468663	68530000.0	34.716343	-83.354611	MULTIPOLYGON (((-83.39605 34.78676, -83.39639 ...

6454 rows × 12 columns

# Export to mappable GeoJSON. Modify this code to set custom color scales. These column
# labels work with CalTopo.
fcjs = forecast.groupby(['lat', 'lon', 'id'])['prediction'].agg(['mean', 'min', 'max'])
fcjs = gpd.GeoDataFrame(fcjs,
                        geometry = [
                            shp.Point(x.lon, x.lat)
                            for x in fcjs.reset_index().itertuples()
                        ],
                        crs=4326)
fcjs['marker-size'] = 2
fcjs['title'] = fcjs.index.get_level_values('id')
fcjs['marker-color'] = "0000FF"
fcjs.loc[fcjs['mean'] > 10, 'marker-color'] = "008080"
fcjs.loc[fcjs['mean'] > 13, 'marker-color'] = "00FF00"
fcjs.loc[fcjs['mean'] > 16, 'marker-color'] = "808000"
fcjs.loc[fcjs['mean'] > 20, 'marker-color'] = "FF0000"
fcjs.to_file(dirs[6] + today + '_map.json')

Forecast Description¶

Warning: this is an experimental stream water temperature forecast for the upcoming week. Results are not guaranteed in any capacity. Typical errors are approximately 2.5 C (4 F); any given prediction may have considerably higher errors. For example, in the southern Rocky Mountains during the first forecast run (April 2026), the model, while it did predict high temperature anomalies, was unable to fully account for the unseasonal loss of snowpack and therefore underpredicted temperatures by several degrees.

These data and related items of information have not been formally disseminated by NOAA, and do not represent any agency determination, view, or policy. This research is funded by the NOAA Cooperative Institute for Research to Operations in Hydrology.

The code used to generate these forecasts is available on GitHub at https://github.com/mines-ciroh/natl-temp-forecast (requires support for ecCodes, for which Linux is recommended). Additionally, required preliminary data for the 913 default forecast sites are available on the GitHub Releases page, so the only data retrieval required is the GFS forecast download. Actual weekly forecasts are made available on CUAHSI HydroShare at http://www.hydroshare.org/resource/b8852529788a437a8d697e9b0435b99a.

The US states shapefile was retrieved from https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html, and the large rivers shapefile from https://www.weather.gov/gis/Rivers. Both were converted to GeoJSON.

forecast[['seasonal', 'anomaly', 'prediction']].describe()

	seasonal	anomaly	prediction
count	6454.000000	6454.000000	6454.000000
mean	12.242600	0.017864	12.260464
std	6.945637	1.836547	6.939229
min	-9.059694	-8.743582	0.000000
25%	7.109645	-0.788977	7.080628
50%	11.700867	0.149350	12.294008
75%	18.104874	1.022540	18.170668
max	33.877230	9.553768	34.791661

states = gpd.read_file("states.json")
states = states.loc[~states["STUSPS"].isin(["AK", "HI", "PR"])]  # CONUS
rivers = gpd.read_file("rivers.json")

def signed_max_abs(series):
    # Find the largest absolute value, preserving sign
    mx = series.max()
    mn = series.min()
    if abs(mx) >= abs(mn):
        return mx
    return mn
def scale_size(series, mn=1, mx=5, ceiling=5):
    series = series.abs()
    series.loc[series > ceiling] = ceiling
    return mn + (series - series.min()) * (mx - mn) / (series.max() - series.min())

fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(12, 6), layout='compressed')
for axr in axes:
    for ax in axr:
        states.plot(facecolor="none", edgecolor="#606060", ax=ax, linewidth=0.8, linestyle='dashed')
(forecast.groupby('id')[['lat', 'lon', 'prediction']].mean().
    rename(columns={'prediction': 'Mean Temp [C]'}).
    plot.scatter(
    x='lon', y='lat', c='Mean Temp [C]',
        cmap='plasma', vmin=0, vmax=35, ax=axes[0, 0],
    ylabel='Latitude (deg N)', xlabel='Longitude (deg E)'))
rivers.loc[rivers["STRAHLER"] > 3].plot(ax=axes[0, 0], linewidth=1, edgecolor='teal')
(forecast.groupby('id').agg({'lat': 'mean', 'lon': 'mean', 'anomaly': 'mean'}).
    rename(columns={'anomaly': 'Mean Temp Anomaly [C]'}).
    plot.scatter(
    x='lon', y='lat', c='Mean Temp Anomaly [C]',
        cmap='berlin', vmin=-5, vmax=5, ax=axes[1, 0],
    ylabel='Latitude (deg N)', xlabel='Longitude (deg E)'))
(forecast.groupby('id').agg({'lat': 'mean', 'lon': 'mean', 'anomaly': signed_max_abs}).
    assign(anomsize=lambda x: scale_size(x['anomaly'], mn=3, mx=20)).
    rename(columns={'anomaly': 'Largest Temp Anomaly [C]'}).
    plot.scatter(
    x='lon', y='lat', c='Largest Temp Anomaly [C]', s='anomsize',
        cmap='berlin', vmin=-5, vmax=5, ax=axes[1, 1],
    ylabel='Latitude (deg N)', xlabel='Longitude (deg E)'))
# Compute weekly *trend*
start = forecast.loc[forecast['date'] == forecast['date'].min()].set_index('id')
end = forecast.loc[forecast['date'] == forecast['date'].max()].set_index('id')
start.loc[:, 'Temp Trend [C/wk]'] = end['prediction'] - start['prediction']
start.plot.scatter(x='lon', y='lat', c='Temp Trend [C/wk]',
        cmap='coolwarm', vmin=-5, vmax=5, ax=axes[0, 1],
    ylabel='Latitude (deg N)', xlabel='Longitude (deg E)')
fig.suptitle(f'''TempEst-NEXT National Weekly Stream Water Temperature Forecast
Dates: {forecast['date'].min().date()} - {forecast['date'].max().date()}''')
fig.savefig(dirs[5] + f'ForecastPlot_{today}.png', dpi=300)