目錄

1140807 meeting

前言

以下使用 dart 模組的 RegressionEnsembleModel 進行填補未來 10 天,可以與 1140806 meeting 的結果做比較。

RegressionEnsembleModel 預測

程式碼為

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
"""
Title: 載入資料並測試 dart 的 RegressionEnsemble 模型
Author: Hsu, Yao-Chih
Version: 1140804
Reference:
"""

# import modules
import os
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm
import matplotlib.pyplot as plt
import csv
import cartopy.crs as ccrs
import cartopy.feature as cfeature


# functions
def check_data_folder(folder):
    return os.path.exists(folder) and os.path.isdir(folder)

def generate_date_range(start_date, end_date):
    """
    Generate a list of dates from start_date to end_date.
    """
    return pd.date_range(start=start_date, end=end_date, freq='D').strftime('%Y%m%d').tolist()

def load_data(file_path):
    """
    Load data from a NetCDF file.
    """
    if os.path.exists(file_path):
        return xr.open_dataset(file_path)
    else:
        raise FileNotFoundError(f'File not found: {file_path}')


# main program
## check data folder
data_folder = f'..\\..\\..\\surface_air_temperature\\data2024'
if check_data_folder(data_folder):
    print(f'Data folder found: {data_folder}')
else:
    raise FileNotFoundError(f'Data folder not found: {data_folder}')

## load data
start_date = '2024-01-01'
end_date = '2024-12-31'
date_list = generate_date_range(start_date, end_date)

## get location and shape
path = os.path.join(data_folder, f'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date_list[0]}.nc4.dap.nc4')
nc4_data = load_data(path)
lat = nc4_data['lat'].values
lon = nc4_data['lon'].values
shape = nc4_data['TLML'].shape
total_locations = shape[1] * shape[2]

## combine data
sample_path = os.path.join(data_folder, f'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date_list[0]}.nc4.dap.nc4')
sample_data = load_data(sample_path)
shape_per_file = sample_data['TLML'].shape
time_per_file = len(sample_data['time'])

total_samples = len(date_list)
combined = np.empty((total_samples * shape_per_file[0], *shape_per_file[1:]), dtype=np.float32)
time_list = np.empty(total_samples * time_per_file, dtype=sample_data['time'].dtype)

for i, date in enumerate(tqdm(date_list, desc="Combining")):
    path = os.path.join(data_folder, f'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date}.nc4.dap.nc4')
    nc4_data = load_data(path)

    start = i * shape_per_file[0]
    end = (i + 1) * shape_per_file[0]

    combined[start:end] = nc4_data['TLML'].values
    time_list[start:end] = nc4_data['time'].values

print(f'Combined data shape: {combined.shape}')

## reshape data
locations = np.stack(np.meshgrid(lon, lat), axis=-1).reshape(-1, 2)
reshaped_data = combined.reshape(combined.shape[0], -1)
pd.DataFrame(reshaped_data)  # 2d data with time as rows and locations as columns
reshaped_df = pd.DataFrame(reshaped_data, columns=[f"({lon}, {lat})" for lon, lat in locations], index=list(time_list))
time_num = len(time_list)
locations_num = len(locations)

## reshape data to (24, day, location)  (24 hours)
reshaped_df.index = pd.to_datetime(reshaped_df.index)
groups = reshaped_df.groupby(reshaped_df.index.time)
stacked = np.stack([group.to_numpy() for time, group in sorted(groups)])
print(stacked)
time_order = sorted(groups.groups.keys())
print(time_order)

## train set
np.random.seed(123)
valid_mask = (locations[:, 0] >= 73) & (locations[:, 0] <= 104) & \
             (locations[:, 1] >= 36) & (locations[:, 1] <= 54) 
valid_indices = np.where(valid_mask)[0]


day_num_train = 250
known_locations_num = valid_indices.shape[0] - 350  # 81.1%
unknown_locations_num = 350                            # 18.9%
locations_num = known_locations_num + unknown_locations_num

locations_index = np.random.choice(valid_indices, size=locations_num, replace=False)
known_locations_index = locations_index[:known_locations_num]
unknown_locations_index = locations_index[known_locations_num:(known_locations_num + unknown_locations_num)]
known_locations_index.sort()
unknown_locations_index.sort()
known_locations_choose = locations[known_locations_index, :]
unknown_locations_choose = locations[unknown_locations_index, :]
stacked_train = stacked[:, :day_num_train, known_locations_index]

future_days = 10
known_real_data = stacked[:, :day_num_train + future_days, known_locations_index]
unknown_real_data = stacked[:, :day_num_train + future_days, unknown_locations_index]

print(f'Load data complete!')


from darts.models import RegressionEnsembleModel, NaiveSeasonal, LinearRegressionModel, NaiveDrift
from darts import TimeSeries
from datetime import datetime
from tqdm import tqdm
real_data = np.concatenate((known_real_data, unknown_real_data), axis=2)
location_choose = np.concatenate((known_locations_choose, unknown_locations_choose), axis=0)
locations_index = np.concatenate((known_locations_index, unknown_locations_index), axis=0)
inference = np.array([[[0.0] * real_data.shape[2]] * future_days] * real_data.shape[0])
start_time = datetime.now()

for i in tqdm(range(real_data.shape[0])):
    train_set = real_data[i, :day_num_train, :]
    train_set = pd.DataFrame(train_set, index=pd.to_datetime(date_list[:day_num_train]), columns=[f"loc_{i}" for i in range(locations_num)])
    train_set = TimeSeries.from_dataframe(train_set)

    test_set = real_data[i, day_num_train:, :]
    test_set = pd.DataFrame(test_set, index=pd.to_datetime(date_list[day_num_train:day_num_train + future_days]), columns=[f"loc_{i}" for i in range(locations_num)])
    test_set = TimeSeries.from_dataframe(test_set)

    base_models = [
        NaiveSeasonal(K=7),
        LinearRegressionModel(lags=30),
        NaiveDrift()
    ]

    model = RegressionEnsembleModel(
        forecasting_models=base_models,
        regression_train_n_points=30
    )

    model.fit(train_set)

    forecast = model.predict(n=future_days)
    inference[i] = forecast.values()

print(f'Inference complete! Time taken: {datetime.now() - start_time}')

y_true = real_data[:, day_num_train:, :]
y_pred = inference

mspe = np.mean(np.square(y_pred - y_true))
rmspe = np.sqrt(mspe)
mape = mspe = np.mean(np.abs(y_pred - y_true))

mspe_pct = np.mean(np.square(y_pred - y_true) / y_true)
rmspe_pct = np.sqrt(mspe_pct)
mape_pct = mspe_pct = np.mean(np.abs(y_pred - y_true) / y_true)

result_df = pd.DataFrame({
    "Method": ["MSPE", "RMSPE", "MAPE", "MSPE%", "RMSPE%", "MAPE%"],
    "Value": [mspe, rmspe, mape, mspe_pct, rmspe_pct, mape_pct]
})

print(result_df)

參數說明如下

模型 / 參數功能說明
NaiveSeasonal(K=7)一種簡單的季節性預測方法,預測未來值為「前 K 天前的值」。適合每週有重複模式的資料(如一週 7 天)。
LinearRegressionModel(lags=30)使用過去 30 個時間點的值作為輸入特徵來訓練線性迴歸模型。支援多變量。
NaiveDrift()預測趨勢線性延伸,也就是最近兩點之間的斜率會被套用到未來。
regression_train_n_points=30在訓練迴歸模型(即 ensemble 的「合併模型」)時,使用 最近 30 個時間點 的 base models 預測結果來擬合真實值。

用時 0:02:44.625559 (CPU)。

gallery_made_with_nanogallery_RegressionEnsembleModel
MethodValue
MSPE4.372672
RMSPE5.466198
MAPE4.372672
MSPE%0.015402
RMSPE%0.324543
MAPE%0.015402

結語

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140806%20meeting/To%20be%20continued.jpg
To be continued!

運行環境

  • 作業系統:Windows 11 24H2
  • 程式語言:Python 3.12.9

延伸學習

參考資料