| """
Title: 載入資料並測試 dart 的 RegressionEnsemble 模型
Author: Hsu, Yao-Chih
Version: 1140804
Reference:
"""
# import modules
import os
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm
import matplotlib.pyplot as plt
import csv
import cartopy.crs as ccrs
import cartopy.feature as cfeature
# functions
def check_data_folder(folder):
    """Return True if the given path exists and is a directory."""
    return os.path.exists(folder) and os.path.isdir(folder)
def generate_date_range(start_date, end_date):
    """
    Generate a list of 'YYYYMMDD' date strings from start_date to end_date
    (inclusive), e.g. ['20240101', '20240102', ...].
    """
    return pd.date_range(start=start_date, end=end_date, freq='D').strftime('%Y%m%d').tolist()
def load_data(file_path):
"""
Load data from a NetCDF file.
"""
if os.path.exists(file_path):
return xr.open_dataset(file_path)
else:
raise FileNotFoundError(f'File not found: {file_path}')
# main program
## check data folder
data_folder = os.path.join('..', '..', '..', 'surface_air_temperature', 'data2024')
if check_data_folder(data_folder):
print(f'Data folder found: {data_folder}')
else:
raise FileNotFoundError(f'Data folder not found: {data_folder}')
## load data
start_date = '2024-01-01'
end_date = '2024-12-31'
date_list = generate_date_range(start_date, end_date)
## get grid coordinates and per-file shape from the first file
sample_path = os.path.join(data_folder, f'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date_list[0]}.nc4.dap.nc4')
sample_data = load_data(sample_path)
lat = sample_data['lat'].values
lon = sample_data['lon'].values
shape_per_file = sample_data['TLML'].shape  # (time, lat, lon)
time_per_file = len(sample_data['time'])
## combine data
total_samples = len(date_list)
combined = np.empty((total_samples * shape_per_file[0], *shape_per_file[1:]), dtype=np.float32)
time_list = np.empty(total_samples * time_per_file, dtype=sample_data['time'].dtype)
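# each daily tavg1_2d_flx_Nx file carries 24 hourly snapshots; TLML is the
# MERRA-2 surface air temperature (temperature of the lowest model layer)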
for i, date in enumerate(tqdm(date_list, desc="Combining")):
path = os.path.join(data_folder, f'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date}.nc4.dap.nc4')
nc4_data = load_data(path)
start = i * shape_per_file[0]
end = (i + 1) * shape_per_file[0]
combined[start:end] = nc4_data['TLML'].values
time_list[start:end] = nc4_data['time'].values
print(f'Combined data shape: {combined.shape}')
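# axis 0 of combined interleaves days and hours: index = day * 24 + hour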
## reshape data
locations = np.stack(np.meshgrid(lon, lat), axis=-1).reshape(-1, 2)
reshaped_data = combined.reshape(combined.shape[0], -1)
# 2-D view: rows are timestamps, columns are grid locations
reshaped_df = pd.DataFrame(reshaped_data, columns=[f"({x}, {y})" for x, y in locations], index=list(time_list))
time_num = len(time_list)
locations_num = len(locations)
## reshape data to (24, day, location) (24 hours)
reshaped_df.index = pd.to_datetime(reshaped_df.index)
groups = reshaped_df.groupby(reshaped_df.index.time)
time_order = sorted(groups.groups.keys())
stacked = np.stack([groups.get_group(t).to_numpy() for t in time_order])
print(f'Stacked data shape: {stacked.shape}')
print(f'Hour order: {time_order}')
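# stacked has shape (24, days, locations): slice i is the day-by-day series
# of temperatures observed at the i-th hour of the day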
## train set
np.random.seed(123)
valid_mask = (locations[:, 0] >= 73) & (locations[:, 0] <= 104) & \
(locations[:, 1] >= 36) & (locations[:, 1] <= 54)
valid_indices = np.where(valid_mask)[0]
day_num_train = 250
known_locations_num = valid_indices.shape[0] - 350 # 81.1%
unknown_locations_num = 350 # 18.9%
locations_num = known_locations_num + unknown_locations_num
locations_index = np.random.choice(valid_indices, size=locations_num, replace=False)
known_locations_index = locations_index[:known_locations_num]
unknown_locations_index = locations_index[known_locations_num:]
known_locations_index.sort()
unknown_locations_index.sort()
known_locations_choose = locations[known_locations_index, :]
unknown_locations_choose = locations[unknown_locations_index, :]
future_days = 10
known_real_data = stacked[:, :day_num_train + future_days, known_locations_index]
unknown_real_data = stacked[:, :day_num_train + future_days, unknown_locations_index]
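# both slices cover the training window plus the future_days evaluation window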
print('Data loading complete!')
from darts.models import RegressionEnsembleModel, NaiveSeasonal, LinearRegressionModel, NaiveDrift
from darts import TimeSeries
from datetime import datetime
real_data = np.concatenate((known_real_data, unknown_real_data), axis=2)
location_choose = np.concatenate((known_locations_choose, unknown_locations_choose), axis=0)
locations_index = np.concatenate((known_locations_index, unknown_locations_index), axis=0)
inference = np.zeros((real_data.shape[0], future_days, real_data.shape[2]))
start_time = datetime.now()
for i in tqdm(range(real_data.shape[0]), desc="Forecasting"):
    # hour-of-day slice i: a daily multivariate series, one component per location
    train_set = real_data[i, :day_num_train, :]
    train_set = pd.DataFrame(train_set, index=pd.to_datetime(date_list[:day_num_train]), columns=[f"loc_{j}" for j in range(locations_num)])
    train_set = TimeSeries.from_dataframe(train_set)
    test_set = real_data[i, day_num_train:, :]
    test_set = pd.DataFrame(test_set, index=pd.to_datetime(date_list[day_num_train:day_num_train + future_days]), columns=[f"loc_{j}" for j in range(locations_num)])
    test_set = TimeSeries.from_dataframe(test_set)
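    # RegressionEnsembleModel fits the base models first, then learns a
    # regression (linear by default in darts) over their forecasts on the
    # last `regression_train_n_points` points of the training series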
base_models = [
NaiveSeasonal(K=7),
LinearRegressionModel(lags=30),
NaiveDrift()
]
model = RegressionEnsembleModel(
forecasting_models=base_models,
regression_train_n_points=30
)
model.fit(train_set)
forecast = model.predict(n=future_days)
inference[i] = forecast.values()
print(f'Inference complete! Time taken: {datetime.now() - start_time}')
y_true = real_data[:, day_num_train:, :]
y_pred = inference
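# aggregate error metrics over all hours, days, and locations (TLML is in kelvin)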
mse = np.mean(np.square(y_pred - y_true))
rmse = np.sqrt(mse)
mae = np.mean(np.abs(y_pred - y_true))
mspe = np.mean(np.square((y_pred - y_true) / y_true))
rmspe = np.sqrt(mspe)
mape = np.mean(np.abs((y_pred - y_true) / y_true))
result_df = pd.DataFrame({
    "Metric": ["MSE", "RMSE", "MAE", "MSPE", "RMSPE", "MAPE"],
    "Value": [mse, rmse, mae, mspe, rmspe, mape]
})
print(result_df)
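# (optional) persist the metrics table; the filename here is only illustrative
# result_df.to_csv('regression_ensemble_metrics.csv', index=False)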