1140806 meeting

Preface

The following fixes an issue where filling in the next 10 days with the TSMixerModel from the darts module produced values that were all integers. The problem stemmed from the output array not being created with the correct dtype, so the floating-point forecasts were truncated to integers when stored.

The fix is:

inference = np.array([[[0.0] * real_data.shape[2]] * future_days] * real_data.shape[0])
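
As a minimal, self-contained sketch of the underlying NumPy behaviour (the array shapes and values here are illustrative only, not taken from the original data): a buffer built from the integer literal 0 gets an integer dtype, so any floating-point forecast written into it is silently truncated, whereas building it from 0.0, as in the fix above, keeps the decimals.

import numpy as np

# Buffer built from the integer literal 0 -> integer dtype; float assignments are truncated.
buf_int = np.array([[[0] * 3] * 2] * 2)
buf_int[0] = [[1.7, 2.9, 3.1], [4.5, 5.5, 6.5]]
print(buf_int[0])    # [[1 2 3] [4 5 6]] -- decimals lost

# Buffer built from 0.0 (as in the fix) -> float dtype; forecasts keep their decimals.
buf_float = np.array([[[0.0] * 3] * 2] * 2)
buf_float[0] = [[1.7, 2.9, 3.1], [4.5, 5.5, 6.5]]
print(buf_float[0])  # [[1.7 2.9 3.1] [4.5 5.5 6.5]]

# An equivalent, arguably clearer alternative is to preallocate with an explicit float dtype.
buf_zeros = np.zeros((2, 2, 3), dtype=np.float32)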

TSMixerModel Prediction

The code is as follows:

"""
Title: Load data and test the TSMixer model from darts
Author: Hsu, Yao-Chih
Version: 1140804
Reference:
"""

# import modules
import os
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm
import matplotlib.pyplot as plt
import csv
import cartopy.crs as ccrs
import cartopy.feature as cfeature


# functions
def check_data_folder(folder):
    return os.path.exists(folder) and os.path.isdir(folder)

def generate_date_range(start_date, end_date):
    """
    Generate a list of dates from start_date to end_date.
    """
    return pd.date_range(start=start_date, end=end_date, freq='D').strftime('%Y%m%d').tolist()

def load_data(file_path):
    """
    Load data from a NetCDF file.
    """
    if os.path.exists(file_path):
        return xr.open_dataset(file_path)
    else:
        raise FileNotFoundError(f'File not found: {file_path}')


# main program
## check data folder
data_folder = '..\\..\\..\\surface_air_temperature\\data2024'
if check_data_folder(data_folder):
    print(f'Data folder found: {data_folder}')
else:
    raise FileNotFoundError(f'Data folder not found: {data_folder}')

## load data
start_date = '2024-01-01'
end_date = '2024-12-31'
date_list = generate_date_range(start_date, end_date)

## get location and shape
path = os.path.join(data_folder, f'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date_list[0]}.nc4.dap.nc4')
nc4_data = load_data(path)
lat = nc4_data['lat'].values
lon = nc4_data['lon'].values
shape = nc4_data['TLML'].shape
total_locations = shape[1] * shape[2]

## combine data
sample_path = os.path.join(data_folder, f'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date_list[0]}.nc4.dap.nc4')
sample_data = load_data(sample_path)
shape_per_file = sample_data['TLML'].shape 
time_per_file = len(sample_data['time'])

total_samples = len(date_list)
combined = np.empty((total_samples * shape_per_file[0], *shape_per_file[1:]), dtype=np.float32)
time_list = np.empty(total_samples * time_per_file, dtype=sample_data['time'].dtype)

for i, date in enumerate(tqdm(date_list, desc="Combining")):
    path = os.path.join(data_folder, f'M2T1NXFLX.5.12.4%3AMERRA2_400.tavg1_2d_flx_Nx.{date}.nc4.dap.nc4')
    nc4_data = load_data(path)

    start = i * shape_per_file[0]
    end = (i + 1) * shape_per_file[0]

    combined[start:end] = nc4_data['TLML'].values
    time_list[start:end] = nc4_data['time'].values

print(f'Combined data shape: {combined.shape}')

## reshape data
locations = np.stack(np.meshgrid(lon, lat), axis=-1).reshape(-1, 2)
reshaped_data = combined.reshape(combined.shape[0], -1)
# 2-D array with time as rows and locations as columns
reshaped_df = pd.DataFrame(reshaped_data, columns=[f"({x}, {y})" for x, y in locations], index=list(time_list))
time_num = len(time_list)
locations_num = len(locations)

## reshape data to (24, day, location)  (24 hours)
reshaped_df.index = pd.to_datetime(reshaped_df.index)
groups = reshaped_df.groupby(reshaped_df.index.time)
stacked = np.stack([group.to_numpy() for time, group in sorted(groups)])
print(stacked)
time_order = sorted(groups.groups.keys())
print(time_order)

## train set
np.random.seed(123)
valid_mask = (locations[:, 0] >= 73) & (locations[:, 0] <= 104) & \
             (locations[:, 1] >= 36) & (locations[:, 1] <= 54)  # longitude 73 to 104, latitude 36 to 54
valid_indices = np.where(valid_mask)[0]


day_num_train = 250
known_locations_num = valid_indices.shape[0] - 350  # 81.1%
unknown_locations_num = 350                            # 18.9%
locations_num = known_locations_num + unknown_locations_num

locations_index = np.random.choice(valid_indices, size=locations_num, replace=False)
known_locations_index = locations_index[:known_locations_num]
unknown_locations_index = locations_index[known_locations_num:(known_locations_num + unknown_locations_num)]
known_locations_index.sort()
unknown_locations_index.sort()
known_locations_choose = locations[known_locations_index, :]
unknown_locations_choose = locations[unknown_locations_index, :]
stacked_train = stacked[:, :day_num_train, known_locations_index]

future_days = 10
known_real_data = stacked[:, :day_num_train + future_days, known_locations_index]
unknown_real_data = stacked[:, :day_num_train + future_days, unknown_locations_index]

print(f'Load data complete!')


# pip install u8darts[all]
from darts.models import TSMixerModel
from darts import TimeSeries
from datetime import datetime
from tqdm import tqdm
real_data = np.concatenate((known_real_data, unknown_real_data), axis=2)
locations_choose = np.concatenate((known_locations_choose, unknown_locations_choose), axis=0)
locations_index = np.concatenate((known_locations_index, unknown_locations_index), axis=0)  # (1850,)
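# NOTE: initialising with 0.0 gives this buffer a float dtype; building it from the integer 0
# would silently truncate the forecasts to integers (the bug described in the preface).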
inference = np.array([[[0.0] * real_data.shape[2]] * future_days] * real_data.shape[0])
start_time = datetime.now()

for i in tqdm(range(real_data.shape[0])):
    train_set = real_data[i, :day_num_train, :]
    train_set = pd.DataFrame(train_set, index=pd.to_datetime(date_list[:day_num_train]), columns=[f"loc_{j}" for j in range(locations_num)])
    train_set = TimeSeries.from_dataframe(train_set)

    test_set = real_data[i, day_num_train:, :]
    test_set = pd.DataFrame(test_set, index=pd.to_datetime(date_list[day_num_train:day_num_train + future_days]), columns=[f"loc_{j}" for j in range(locations_num)])
    test_set = TimeSeries.from_dataframe(test_set)

    model = TSMixerModel(
        input_chunk_length=30,  
        output_chunk_length=10, 
        n_epochs=3800,          
        dropout=0.0005,         
        batch_size=32,
        use_reversible_instance_norm=True,
        random_state=42
    )

    model.fit(train_set)

    forecast = model.predict(future_days)
    inference[i] = forecast.values()

print(f'Inference complete! Time taken: {datetime.now() - start_time}')

y_true = real_data[:, day_num_train:, :]
y_pred = inference

mspe = np.mean(np.square(y_pred - y_true))
rmspe = np.sqrt(mspe)
mape = np.mean(np.abs(y_pred - y_true))

mspe_pct = np.mean(np.square(y_pred - y_true) / y_true)
rmspe_pct = np.sqrt(mspe_pct)
mape_pct = np.mean(np.abs(y_pred - y_true) / y_true)


result_df = pd.DataFrame({
    "Method": ["MSPE", "RMSPE", "MAPE", "MSPE%", "RMSPE%", "MAPE%"],
    "Value": [mspe, rmspe, mape, mspe_pct, rmspe_pct, mape_pct]
})

print(result_df)
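
For reference, the error metrics computed in the script above are defined as follows (note that, as coded, MAPE is a plain mean absolute error, and the "%" variants divide by the raw observed values rather than multiplying by 100):

$\mathrm{MSPE} = \frac{1}{N}\sum_{i}(\hat{y}_i - y_i)^2$, $\mathrm{RMSPE} = \sqrt{\mathrm{MSPE}}$, $\mathrm{MAPE} = \frac{1}{N}\sum_{i}\lvert \hat{y}_i - y_i \rvert$

$\mathrm{MSPE\%} = \frac{1}{N}\sum_{i}\frac{(\hat{y}_i - y_i)^2}{y_i}$, $\mathrm{RMSPE\%} = \sqrt{\mathrm{MSPE\%}}$, $\mathrm{MAPE\%} = \frac{1}{N}\sum_{i}\frac{\lvert \hat{y}_i - y_i \rvert}{y_i}$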

[Image gallery: TSMixerModel prediction results]

Elapsed time: 7:02:48.156906 (CPU).

Method    Value
MSPE      7.381319
RMSPE     8.507541
MAPE      7.381319
MSPE%     0.026069
RMSPE%    0.506345
MAPE%     0.026069

Elapsed time: 1:52:58.737929 (GPU).

Method    Value
MSPE      7.203228
RMSPE     8.319097
MAPE      7.203228
MSPE%     0.025429
RMSPE%    0.494964
MAPE%     0.025429

$SSSD^{S4}$ Prediction

[Image gallery: $SSSD^{S4}$ prediction results]

Method    Value
MSPE      3.887944
RMSPE     4.844090
MAPE      3.887944
MSPE%     0.013749
RMSPE%    0.288593
MAPE%     0.013749

Conclusion

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140806%20meeting/To%20be%20continued.jpg
To be continued!

Runtime Environment

  • Local operating system: Windows 11 24H2
    • Programming language: Python 3.12.9
  • Computing platform: Taiwan AI Cloud, National Center for High-performance Computing, National Applied Research Laboratories
    • Operating system: Ubuntu
    • Miniconda
    • GPU: NVIDIA Tesla V100 32GB
    • CUDA 12.8 driver
    • Programming language: Python 3.10.16 for Linux

Further Reading

References