
1140429 meeting

Introduction

This post applies the $SSSD^{S4}$ model to the Ministry of Environment's real-time air quality data. Because the model requires a training set of time-series values with no missing entries, we first use the autoFRK model to impute the data via the spatial correlation among stations, and then split the result into training and test sets for $SSSD^{S4}$ training and imputation.

autoFRK

Load the data and regroup it.

library(dplyr)
data = read.csv("即時值查詢.csv", sep = ",", skip = 2, header = TRUE, fileEncoding = "big5")

stations <- unique(data[[1]])
dates <- seq.Date(from = min(as.Date(data[[2]])), to = max(as.Date(data[[2]])), by = "day") %>% format("%Y/%m/%d") %>% as.character()
pollutants <- unique(data[[3]])

cat(paste0("length of data:\n  stations: ", length(stations),
             ", dates: ", length(dates),
             ", total times: ", length(dates) * 24,
             ", pollutants: ", length(pollutants)
             ))

data_dict <- list()
for (pollutant in pollutants) {
  data_dict[[pollutant]] <- list()
  for (station in stations) {
    data_dict[[pollutant]][[station]] <- list()
    for (date in dates) {
      data_dict[[pollutant]][[station]][[as.character(date)]] <- rep(NA, 24)
    }
  }
}

for (i in seq_len(nrow(data))) {
  station <- data[i, 1]
  date <- data[i, 2]
  pollutant <- data[i, 3]
  values <- suppressWarnings(as.numeric(data[i, 4:27]))
  data_dict[[pollutant]][[station]][[as.character(date)]] <- values
}
Sample output
length of data:
  stations: 26, dates: 464, total times: 11136, pollutants: 5

Reshape the data into the input shape for the $SSSD^{S4}$ model (concatenating the dates and hours in chronological order).

pollutant_dict = list()

for (pollutant in pollutants) {
  station_list = c()
  
  for (station in stations) {
    date_list = c()
    
    for (date in dates) {
      date_list = append(date_list, data_dict[[pollutant]][[station]][[date]])
    }
    
    station_list = rbind(station_list, date_list)
  }
  
  rownames(station_list) = stations
  pollutant_dict[[pollutant]] = station_list
}

pollutant_dict$CO[, 1:10]

The first 10 values of pollutant CO at each station.

Sample output
       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
基隆   0.35 0.37 0.38 0.39 0.39 0.39 0.41 0.40 0.39  0.37
士林   0.35 0.37 0.37 0.39 0.40 0.40 0.39 0.39 0.39  0.38
大同   0.48 0.56 0.47 0.46 0.45 0.44 0.48 0.53 0.48  0.49
中山   0.41 0.43 0.42 0.42 0.42 0.42 0.46 0.50 0.46  0.45
古亭   0.37 0.35 0.35 0.37 0.39 0.39 0.38 0.39 0.41  0.39
松山   0.36 0.37 0.38 0.37 0.38 0.38 0.39 0.41 0.40  0.38
陽明   0.36 0.38 0.39 0.41 0.42 0.42 0.41 0.41 0.40  0.36
萬華   0.40 0.43 0.43 0.42 0.43 0.43 0.44 0.45 0.45  0.43
三重   0.63 0.72 0.62 0.60 0.54 0.61 0.78 0.74 0.80  0.81
土城   0.36 0.37 0.37 0.37 0.39 0.40 0.41 0.43 0.43  0.43
永和   0.43 0.40 0.40 0.40 0.42 0.45 0.46 0.50 0.50  0.47
汐止   0.32 0.35 0.35 0.36 0.37 0.38 0.40 0.41 0.38  0.37
板橋   0.40 0.41 0.40 0.41 0.42 0.43 0.45 0.46 0.47  0.45
林口   0.36 0.37 0.38 0.38 0.39 0.40 0.40 0.41 0.40  0.39
淡水   0.33 0.35 0.37 0.38 0.39 0.40 0.40 0.41 0.40  0.35
菜寮   0.40 0.39 0.40 0.41 0.42 0.42 0.43 0.45 0.45  0.43
新店   0.35 0.35 0.35 0.36 0.37 0.39 0.40 0.42 0.42  0.42
新莊   0.34 0.36 0.37 0.37 0.39 0.39 0.40 0.41 0.41  0.39
萬里   0.36 0.38 0.39 0.40 0.40 0.41 0.40 0.39 0.37  0.35
富貴角 0.31 0.31 0.33 0.34 0.35 0.33 0.33 0.32 0.29  0.27
大園   0.35 0.35 0.37 0.38 0.40 0.41 0.40 0.41 0.42  0.40
中壢   0.44 0.47 0.44 0.46 0.43 0.49 0.52 0.58 0.59  0.51
平鎮   0.36 0.39 0.40 0.41 0.42 0.44 0.48 0.48 0.46  0.45
桃園   0.35 0.37 0.40 0.39 0.40 0.42 0.45 0.46 0.44  0.43
龍潭   0.36 0.33 0.33 0.34 0.35 0.38 0.40 0.40 0.40  0.40
觀音   0.30 0.33 0.35 0.35 0.36 0.36 0.37 0.37 0.35  0.31

Obtain the coordinates of each station.

location = read.table('station_locations.csv', header = TRUE, sep = ',')
print(location)

station_location = data.frame(station = stations, lon = NA, lat = NA)
for (i in seq_along(stations)) {
  for (j in seq_len(nrow(location))) {
    if (stations[i] == location[j, 1]) {
      station_location[i, 2:3] = location[j, 2:3]
      break
    }
  }
}

station_location
Sample output

station     lon         lat
<chr>      <dbl>       <dbl>
基隆	121.7601	25.12917
士林	121.5167	25.10334
大同	121.5134	25.06331
中山	121.5265	25.06236
古亭	121.5296	25.02061
松山	121.5786	25.05000
陽明	121.5296	25.18272
萬華	121.5080	25.04650
三重	121.4938	25.07261
土城	121.4519	24.98253

Impute with autoFRK. The model is fit in an expanding-window fashion: the later the time point, the more preceding data are used for fitting and prediction, and only the missing values are filled in.

library(autoFRK)
pollutant_dict.filled = list()
m = length(pollutants)
n = dim(pollutant_dict[[names(pollutant_dict)[1]]])[2]
filled_data = c()

start_time <- Sys.time()
# progress bar
pb = txtProgressBar(min = 0, max = m * n, style = 3)

for (i in 1:m) {
  station_data = c()
  for (j in 1:n) {
    row_data = as.matrix(pollutant_dict[[names(pollutant_dict)[i]]][ ,j])
    na_factor_data = is.na(row_data)
    
    if (sum(na_factor_data) == 0) {
      # complete column: append it and refit autoFRK on all data collected so far
      station_data = as.matrix(cbind(station_data, row_data))
      
      model = autoFRK(data = station_data,
                      loc = as.matrix(station_location[, c("lon", "lat")])
                      )
    } else {
      # column with missing values: predict from the latest fitted model
      # and fill in only the missing entries
      filled_data = predict.FRK(object = model,
                                obsData = row_data
                                )
      row_data[na_factor_data] = filled_data$pred.value[na_factor_data]
      station_data = cbind(station_data, row_data)
    }
    
    setTxtProgressBar(pb, ((i - 1) * n + j))  # progress bar
    
    elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
    avg_time_per_iteration <- elapsed_time / ((i - 1) * n + j)
    expected_remaining_time <- avg_time_per_iteration * (m * n - ((i - 1) * n + j))
    cat(sprintf("\r%d / %d items, %.2f / %.2f seconds", ((i - 1) * n + j), m * n, expected_remaining_time, elapsed_time))
    flush.console()
  }
  
  pollutant_dict.filled[[names(pollutant_dict)[i]]] = station_data
}
  
close(pb)  # progress bar
Sample output
55680 / 55680 items, 0.00 / 4472.01 secondss===========================================================================| 100%

Save to a .npy file.

library(reticulate)
np <- import("numpy")
npy_array <- array(NA, dim = c(length(pollutants), nrow(pollutant_dict.filled[[1]]), ncol(pollutant_dict.filled[[1]])))

for (i in seq_along(pollutants)) {
  npy_array[i, , ] <- as.matrix(pollutant_dict.filled[[pollutants[i]]])
}

np$save("pollutants.npy", npy_array)

At this point, the dataset is fully organized and imputed.
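As a quick sanity check before moving on, the saved array can be loaded back in Python (a minimal sketch; the expected shape follows from the 5 pollutants, 26 stations, and 11136 hourly time points above):

import numpy as np

arr = np.load('pollutants.npy')
print(arr.shape)            # expected: (5, 26, 11136)
print(np.isnan(arr).sum())  # expected: 0 after imputation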

Preparing the test set

Before feeding the data into the $SSSD^{S4}$ model for training, let's first inspect the dataset.

First, load the modules and define some plotting functions.

# import modules
import numpy as np
import matplotlib.pyplot as plt

# functions
def test_plot(full_data, missing_data, title, save_path):
    """
    Plot the first 3 dimensions of the first sample, comparing full data and missing data.
    """
    fig, axes = plt.subplots(3, 1, figsize=(12, 8), sharex=True)
    for j in range(3):
        axes[j].plot(full_data[0, j], color='gray', label='full data', alpha=0.6)
        axes[j].plot(missing_data[0, j], color='red', label=title)
        axes[j].set_ylabel(f'Dim {j}')
        axes[j].legend()

    plt.suptitle(title)
    plt.xlabel('Time')
    plt.tight_layout()
    plt.savefig(f"{save_path}.png", dpi=300)
    plt.show()

Load the dataset and split it into training and test sets. Here the last 500 time points are taken as the test set, and the time points from the 2,500th-from-last to the 500th-from-last as the training set.

## load data
filename = r'real_time\pollutants.npy'
data = np.load(filename)

train_data = data[:, :, -2500:-500]
test_data = data[:, :, -500:]

train_data = train_data.transpose(0, 2, 1)
test_data = test_data.transpose(0, 2, 1)

print(train_data.shape)
print(test_data.shape)

np.save(r'real_time\pollutants_train.npy', train_data)
np.save(r'real_time\pollutants_test.npy', test_data)

Now, convert the test set into the rm (random missing), rbm (random block missing), bm (blackout missing), and tf (time forecasting) missing patterns, and plot the first three stations. Each pattern masks $500 \times 0.4 = 200$ time points per series (approximately so for rm, which drops each point independently with probability 0.4).

In each figure, gray shows the complete test data, and red shows the masked data to be sent for imputation.

rm

## rm
missing_rate = 0.4
test_data_rm = data[:, :, -500:].copy()
mask = np.random.rand(*test_data_rm.shape) > missing_rate
test_data_rm[~mask] = np.nan

print(test_data_rm[0, 0])
test_plot(test_data.transpose(0, 2, 1), test_data_rm, 'test_data_rm', r'real_time\pollutants_test_rm')

test_data_rm = test_data_rm.transpose(0, 2, 1)
print(test_data_rm.shape)
np.save(r'real_time\pollutants_test_rm.npy', test_data_rm)

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/pollutants_test_rm.png
Test data under the rm missing pattern.

rbm

## rbm
missing_rate = 0.4
missing = int(500 * missing_rate)
test_data_rbm = data[:, :, -500:].copy()

for i in range(test_data_rbm.shape[0]):
    for j in range(test_data_rbm.shape[1]):
        start = np.random.randint(0, 500 - missing + 1)
        end = start + missing
        test_data_rbm[i, j, start:end] = np.nan
        
print(test_data_rbm[0, 0])
print(test_data_rbm[0, 1])
test_plot(test_data.transpose(0, 2, 1), test_data_rbm, 'test_data_rbm', r'real_time\pollutants_test_rbm')

test_data_rbm = test_data_rbm.transpose(0, 2, 1)
print(test_data_rbm.shape)
np.save(r'real_time\pollutants_test_rbm.npy', test_data_rbm)

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/pollutants_test_rbm.png
Test data under the rbm missing pattern.

bm

## bm
missing_rate = 0.4
missing = int(500 * missing_rate)
test_data_bm = data[:, :, -500:].copy()

start = np.random.randint(0, 500 - missing + 1)
end = start + missing
test_data_bm[:, :, start:end] = np.nan

print(test_data_bm[0, 0])
print(test_data_bm[0, 1])
test_plot(test_data.transpose(0, 2, 1), test_data_bm, 'test_data_bm', r'real_time\pollutants_test_bm')

test_data_bm = test_data_bm.transpose(0, 2, 1)
print(test_data_bm.shape)
np.save(r'real_time\pollutants_test_bm.npy', test_data_bm)

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/pollutants_test_bm.png
Test data under the bm missing pattern.

tf

## tf
missing_rate = 0.4
missing = int(500 * missing_rate)
test_data_tf = data[:, :, -500:].copy()

test_data_tf[:, :, -missing:] = np.nan

print(test_data_tf[0, 0])
print(test_data_tf[0, 1])
test_plot(test_data.transpose(0, 2, 1), test_data_tf, 'test_data_tf', r'real_time\pollutants_test_tf')

test_data_tf = test_data_tf.transpose(0, 2, 1)
print(test_data_tf.shape)
np.save(r'real_time\pollutants_test_tf.npy', test_data_tf)

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/pollutants_test_tf.png
Test data under the tf missing pattern.

$SSSD^{S4}$

This dataset covers 5 pollutants, 26 stations, and $464 \times 24 = 11136$ time points.

After the preceding preparation, we have 2,000 time points for training and 500 for testing, the latter under several missing patterns. The input arrays have shape (pollutant, time, station), and the imputed output has shape (pollutant, station, time).
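To keep the axis conventions straight, a minimal sketch (the zero array is only a placeholder):

import numpy as np

x = np.zeros((5, 500, 26))  # model input: (pollutant, time, station)
y = x.transpose(0, 2, 1)    # imputation output: (pollutant, station, time)
print(y.shape)              # (5, 26, 500)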

The configuration file used for training.

# Training configuration
batch_size: 50  # Batch size
output_directory: "./results/checkpoint"  # Output directory for checkpoints and logs
ckpt_iter: "max"  # Checkpoint mode (max or min)
iters_per_ckpt: 100  # Checkpoint frequency (number of epochs)
iters_per_logging: 100  # Log frequency (number of iterations)
n_iters: 10000  # Maximum number of iterations
learning_rate: 0.002  # Learning rate

# Additional training settings
only_generate_missing: true  # Generate missing values only
use_model: 2  # Model to use for training
masking: "forecast"  # Masking strategy for missing values
missing_k: 5  # Number of missing values

# Data paths
data:
  train_path: "./datasets/real_time_by_autoFRK/pollutants_train.npy"  # Path to training data

The configuration file for the model.

wavenet:
  # WaveNet model parameters
  input_channels: 26  # Number of input channels
  output_channels: 26  # Number of output channels
  residual_layers: 32  # Number of residual layers
  residual_channels: 128  # Number of channels in residual blocks
  skip_channels: 128  # Number of channels in skip connections

  # Diffusion step embedding dimensions
  diffusion_step_embed_dim_input: 64  # Input dimension
  diffusion_step_embed_dim_hidden: 512  # Middle dimension
  diffusion_step_embed_dim_output: 512  # Output dimension

  # Structured State Spaces sequence model (S4) configurations
  s4_max_sequence_length: 2000  # Maximum sequence length
  s4_state_dim: 64  # State dimension
  s4_dropout: 0.0  # Dropout rate
  s4_bidirectional: true  # Whether to use bidirectional layers
  s4_use_layer_norm: true  # Whether to use layer normalization

diffusion:
  # Diffusion model parameters
  T: 200  # Number of diffusion steps
  beta_0: 0.0001  # Initial beta value
  beta_T: 0.02  # Final beta value

The configuration file used for imputation.

# Inference configuration
batch_size: 50  # Batch size for inference
output_directory: "./results/checkpoint/rm"  # Output directory for inference results
ckpt_path: "./results/checkpoint"  # Path to checkpoint for inference
trials: 1 # Replications

# Additional training settings
only_generate_missing: true  # Generate missing values only
use_model: 2  # Model to use for training
masking: "forecast"  # Masking strategy for missing values
missing_k: 200  # Number of missing values

# Data paths
data:
  test_path: "./datasets/real_time_by_autoFRK/pollutants_test_rm.npy"  # Path to test data

Training takes about 2 hours, and imputation about 2 minutes.

The training command is as follows.

./scripts/diffusion/training_job.sh -m configs/model.yaml -t configs/training.yaml

Note that the original code only accepts complete inputs without missing values, so imputing data with real missing entries fails.

(sssd) u6025091@iq0l0ttest1140426-9bhg4:~/SSSD_CP$ ./scripts/diffusion/inference_job.sh -m configs/model.yaml -i configs/inference.yaml
Intializing conda
Activating Conda Env: sssd
[Execution - Inference]
/home/u6025091/SSSD_CP/scripts/diffusion/infer.py --model_config configs/model.yaml --inference_config configs/inference.yaml
2025-04-26 14:28:16,698 - sssd.utils.logger - INFO - Using 1 GPUs!
2025-04-26 14:28:16,939 - sssd.utils.logger - INFO - Current time: 2025-04-26 14:28:16
2025-04-26 14:28:28,592 - sssd.utils.logger - INFO - The 1th inference trial
2025-04-26 14:28:28,592 - sssd.utils.logger - INFO - Output directory: ./results/checkpoint/bm/T200_beta00.0001_betaT0.02/max
2025-04-26 14:28:29,827 - sssd.utils.logger - INFO - Successfully loaded model at iteration 10000
Traceback (most recent call last):
  File "/home/u6025091/SSSD_CP/scripts/diffusion/infer.py", line 117, in <module>
    run_job(model_config, inference_config, device, args.ckpt_iter)
  File "/home/u6025091/SSSD_CP/scripts/diffusion/infer.py", line 96, in run_job
    ).generate()
  File "/home/u6025091/.local/lib/python3.10/site-packages/sssd/inference/generator.py", line 151, in generate
    mse = mean_squared_error(
  File "/home/u6025091/local/envs/sssd/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 438, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/home/u6025091/local/envs/sssd/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 96, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "/home/u6025091/local/envs/sssd/lib/python3.10/site-packages/sklearn/utils/validation.py", line 800, in check_array
    _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan")
  File "/home/u6025091/local/envs/sssd/lib/python3.10/site-packages/sklearn/utils/validation.py", line 114, in _assert_all_finite
    raise ValueError(
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

I modified the following code so that the model builds the mask from the actual missing positions. This code lives in /SSSD_CP-main/sssd/core/utils.py.

def get_mask_forecast(sample: torch.Tensor, k: int) -> torch.Tensor:
    """
    Get mask of same segments (black-out missing) across channels based on k.

    Args:
        sample (torch.Tensor): Tensor of shape [# of samples, # of channels].
        k (int): Number of missing values.

    Returns:
        torch.Tensor: Mask of sample's shape where 0's indicate missing values to be imputed, and 1's indicate preserved values.
    """
    #mask = torch.ones_like(sample)  # Initialize mask with all ones

    # Calculate the indices of missing values
    #s_nan = torch.arange(mask.shape[0] - k, mask.shape[0])

    # Apply mask for each channel
    #for channel in range(mask.shape[1]):
    #    mask[s_nan, channel] = 0

    mask = (~torch.isnan(sample)).float()  # replace only missing values

    return mask

The following also needs to be modified so that missing values are filled with 0 before imputation can run (NA cannot take part in numeric operations and raises an error). This code lives in /SSSD_CP-main/sssd/inference/generator.py.

The line batch = torch.nan_to_num(batch, nan=0.0)  # Replace NaN with 0.0 was added.
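For reference, torch.nan_to_num simply replaces NaN entries with the given value, so the sampler only ever sees finite numbers:

import torch

x = torch.tensor([1.0, float('nan'), 3.0])
print(torch.nan_to_num(x, nan=0.0))  # tensor([1., 0., 3.])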

Note that this invalidates the original mse and mape, which must be recomputed separately after imputation finishes. The change affects only mse and mape; I have confirmed it does not affect any other computation, and both are used only in the post-inference output.
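In brief, the recomputation only needs the saved arrays (a sketch using the rm files; the full version appears in the analysis section below):

import numpy as np

truth = np.load(r'real_time\pollutants_test.npy').transpose(0, 2, 1)
masked = np.load(r'real_time\pollutants_test_rm.npy').transpose(0, 2, 1)
imputed = np.load(r'real_time\imputation\imputation0_rm.npy')

miss = np.isnan(masked)                             # positions the model had to impute
print(((truth[miss] - imputed[miss]) ** 2).mean())  # MSE over the imputed positions only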

def generate(self) -> list:
    """Generate samples using the given neural network model."""
    all_mses = []
    all_mapes = []
    for index, (batch,) in enumerate(self.dataloader):
        batch = batch.to(self.device)
        mask = self._update_mask(batch)

        if torch.isnan(mask).any():  # debug
            print(f"[Batch {index}] NaN in mask!")

        batch = torch.nan_to_num(batch, nan=0.0)  # Replace NaN with 0.0

        if torch.isnan(batch).any():  # debug
            print(f"[NaN DETECTED] Batch {index} has NaNs!")

        batch = batch.permute(0, 2, 1)

        generated_series = (
            sampling(
                net=self.net,
                size=batch.shape,
                diffusion_hyperparams=self.diffusion_hyperparams,
                cond=batch,
                mask=mask,
                only_generate_missing=self.only_generate_missing,
                device=self.device,
            )
            .detach()
            .cpu()
            .numpy()
        )

        if np.isnan(generated_series).any():  # debug
            print("[NaN DETECTED] in generated_series")
            print("Locations:", np.argwhere(np.isnan(generated_series)))


        batch = batch.detach().cpu().numpy()
        mask = mask.detach().cpu().numpy()
        mse = mean_squared_error(
            batch[~mask.astype(bool)], generated_series[~mask.astype(bool)]
        )
        mape = mean_absolute_percentage_error(
            batch[~mask.astype(bool)], generated_series[~mask.astype(bool)]
        )
        all_mses.append(mse)
        all_mapes.append(mape)
        results = {
            "imputation": generated_series,
            "original": batch,
            "mask": mask,
        }
        self._save_data(results, index)

    return all_mses, all_mapes
Reminder: the modifications above are used only for model inference; training still uses the original code.

After these adjustments, imputation can be run with the following command.

./scripts/diffusion/inference_job.sh -m configs/model.yaml -i configs/inference.yaml

The run results, in order: rm, rbm, bm, and tf.

Sample output
(sssd) u6025091@iq0l0ttest1140426-9bhg4:~/SSSD_CP$ ./scripts/diffusion/inference_job.sh -m configs/model.yaml -i configs/inference.yaml
Intializing conda
Activating Conda Env: sssd
[Execution - Inference]
/home/u6025091/SSSD_CP/scripts/diffusion/infer.py --model_config configs/model.yaml --inference_config configs/inference.yaml
2025-04-26 15:05:30,600 - sssd.utils.logger - INFO - Using 1 GPUs!
2025-04-26 15:05:30,749 - sssd.utils.logger - INFO - Current time: 2025-04-26 15:05:30
2025-04-26 15:05:41,170 - sssd.utils.logger - INFO - The 1th inference trial
2025-04-26 15:05:41,171 - sssd.utils.logger - INFO - Output directory: ./results/checkpoint/rm/T200_beta00.0001_betaT0.02/max
2025-04-26 15:05:41,544 - sssd.utils.logger - INFO - Successfully loaded model at iteration 10000
2025-04-26 15:06:10,954 - sssd.utils.logger - INFO - Average MSE: 13.989262580871582
2025-04-26 15:06:10,955 - sssd.utils.logger - INFO - Average MAPE: 6969775217442816.0
2025-04-26 15:06:10,955 - sssd.utils.logger - INFO - Current time: 2025-04-26 15:06:10
Inference Job completed
(sssd) u6025091@iq0l0ttest1140426-9bhg4:~/SSSD_CP$ ./scripts/diffusion/inference_job.sh -m configs/model.yaml -i configs/inference.yaml
Intializing conda
Activating Conda Env: sssd
[Execution - Inference]
/home/u6025091/SSSD_CP/scripts/diffusion/infer.py --model_config configs/model.yaml --inference_config configs/inference.yaml
2025-04-26 15:06:22,825 - sssd.utils.logger - INFO - Using 1 GPUs!
2025-04-26 15:06:23,006 - sssd.utils.logger - INFO - Current time: 2025-04-26 15:06:23
2025-04-26 15:06:33,472 - sssd.utils.logger - INFO - The 1th inference trial
2025-04-26 15:06:33,474 - sssd.utils.logger - INFO - Output directory: ./results/checkpoint/rbm/T200_beta00.0001_betaT0.02/max
2025-04-26 15:06:34,008 - sssd.utils.logger - INFO - Successfully loaded model at iteration 10000
2025-04-26 15:07:03,308 - sssd.utils.logger - INFO - Average MSE: 13.990039825439453
2025-04-26 15:07:03,308 - sssd.utils.logger - INFO - Average MAPE: 8490500730388480.0
2025-04-26 15:07:03,308 - sssd.utils.logger - INFO - Current time: 2025-04-26 15:07:03
Inference Job completed
(sssd) u6025091@iq0l0ttest1140426-9bhg4:~/SSSD_CP$ ./scripts/diffusion/inference_job.sh -m configs/model.yaml -i configs/inference.yaml
Intializing conda
Activating Conda Env: sssd
[Execution - Inference]
/home/u6025091/SSSD_CP/scripts/diffusion/infer.py --model_config configs/model.yaml --inference_config configs/inference.yaml
2025-04-26 15:07:19,904 - sssd.utils.logger - INFO - Using 1 GPUs!
2025-04-26 15:07:20,081 - sssd.utils.logger - INFO - Current time: 2025-04-26 15:07:20
2025-04-26 15:07:30,561 - sssd.utils.logger - INFO - The 1th inference trial
2025-04-26 15:07:30,562 - sssd.utils.logger - INFO - Output directory: ./results/checkpoint/bm/T200_beta00.0001_betaT0.02/max
2025-04-26 15:07:30,889 - sssd.utils.logger - INFO - Successfully loaded model at iteration 10000
2025-04-26 15:08:00,273 - sssd.utils.logger - INFO - Average MSE: 14.108148574829102
2025-04-26 15:08:00,273 - sssd.utils.logger - INFO - Average MAPE: 1.3516574539382784e+16
2025-04-26 15:08:00,273 - sssd.utils.logger - INFO - Current time: 2025-04-26 15:08:00
Inference Job completed
(sssd) u6025091@iq0l0ttest1140426-9bhg4:~/SSSD_CP$ ./scripts/diffusion/inference_job.sh -m configs/model.yaml -i configs/inference.yaml
Intializing conda
Activating Conda Env: sssd
[Execution - Inference]
/home/u6025091/SSSD_CP/scripts/diffusion/infer.py --model_config configs/model.yaml --inference_config configs/inference.yaml
2025-04-26 15:08:14,197 - sssd.utils.logger - INFO - Using 1 GPUs!
2025-04-26 15:08:14,366 - sssd.utils.logger - INFO - Current time: 2025-04-26 15:08:14
2025-04-26 15:08:24,075 - sssd.utils.logger - INFO - The 1th inference trial
2025-04-26 15:08:24,076 - sssd.utils.logger - INFO - Output directory: ./results/checkpoint/tf/T200_beta00.0001_betaT0.02/max
2025-04-26 15:08:24,428 - sssd.utils.logger - INFO - Successfully loaded model at iteration 10000
2025-04-26 15:08:53,786 - sssd.utils.logger - INFO - Average MSE: 14.107381820678711
2025-04-26 15:08:53,786 - sssd.utils.logger - INFO - Average MAPE: 1.3518593174011904e+16
2025-04-26 15:08:53,786 - sssd.utils.logger - INFO - Current time: 2025-04-26 15:08:53
Inference Job completed

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/1745656856441.png
A successful run.

Analysis of the imputation results

The following code performs the analysis and plotting. In the figures below, gray is the complete test data, red is the masked data sent for imputation, and orange is the imputation result.

Load modules and define functions.

# test imputation
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

test_data = np.load(r'real_time\pollutants_test.npy').transpose(0, 2, 1)

## functions
def imputation_plot(full_data, missing_data, imputation_data, title, save_path):
    """
    Plot the first 3 dimensions of the first sample, comparing full data and missing data.
    """
    show_dims = 2
    fig, axes = plt.subplots(show_dims, 1, figsize=(12, 8), sharex=True)
    #imputation_data = np.where(np.isnan(missing_data), imputation_data, np.nan)
    for j in range(show_dims):
        axes[j].plot(full_data[0, j], color='gray', label='full data', alpha=0.6)
        axes[j].plot(imputation_data[0, j], color='orange', label='imputation data', alpha=0.6)
        axes[j].plot(missing_data[0, j], color='red', label=title)
        axes[j].set_ylabel(f'Dim {j}')
        axes[j].legend()

    plt.suptitle(title)
    plt.xlabel('Time')
    plt.tight_layout()
    plt.savefig(f"{save_path}.png", dpi=300)
    #plt.show()

def imputation_plot_each_dim(full_data, missing_data, imputation_data, title, save_dir):
    """
    Plot each dimension of the first sample separately, comparing full data, missing data, and imputation data.
    Save each plot as a separate file.
    """
    num_dims = full_data.shape[1]
    os.makedirs(save_dir, exist_ok=True)  # make sure the save directory exists

    for j in tqdm(range(num_dims)):
        fig, ax = plt.subplots(figsize=(12, 4))
        ax.plot(full_data[0, j], color='gray', label='full data', alpha=0.6)
        ax.plot(imputation_data[0, j], color='orange', label='imputation data', alpha=0.6)
        ax.plot(missing_data[0, j], color='red', label=title)
        ax.set_ylabel(f'Dim {j}')
        ax.set_xlabel('Time')
        ax.set_title(f'{title} - Dimension {j}')

        # place the legend outside the axes
        ax.legend(
            loc='center left',            # anchor the legend by its left-center point
            bbox_to_anchor=(1, 0.5)       # (x, y): x=1 is the right edge of the axes
        )

        fig.tight_layout(rect=[0, 0, 0.85, 1])  # shrink the axes to leave room on the right for the legend
        save_path = os.path.join(save_dir, f"{title}_dim{j}.png")
        plt.savefig(save_path, dpi=300)
        plt.close(fig)  # close each figure to avoid running out of memory when plotting many files

rm

## rm
filename = r'real_time\imputation\imputation0_rm.npy'
rm_data = np.load(filename)
print(f'Shape: {rm_data.shape}')
print(f'NAs: {np.isnan(rm_data).sum()}')
# print(rm_data[0, 0])
print(f'MSPE for all: {((test_data - rm_data)**2).mean()}')

test_data_rm = np.load(r'real_time\pollutants_test_rm.npy').transpose(0, 2, 1)
print(f'MSPE only for missing: {((test_data[np.isnan(test_data_rm)] - rm_data[np.isnan(test_data_rm)])**2).mean()}')

imputation_plot(test_data, test_data_rm, rm_data, 'imputation rm test data', r'real_time\imputation\imputation0_rm')
imputation_plot_each_dim(test_data, test_data_rm, rm_data, 'imputation rm test data', r'real_time\imputation\imputation0_rm\each_dim')
# test_data[0, 0][1:10]
# test_data_rm[0, 0][1:10]
# rm_data[0, 0][1:10]
Sample output
Shape: (5, 26, 500)
NAs: 0
MSPE for all: 5.630354000694568
MSPE only for missing: 7.2555356208301784
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:13<00:00,  1.95it/s]

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/result/imputation0_rm.png
Imputation result under rm.

The imputation results for all stations are shown below.

[nanogallery2 gallery: rm imputation results for all stations]

rbm

## rbm
filename = r'real_time\imputation\imputation0_rbm.npy'
rbm_data = np.load(filename)
print(f'Shape: {rbm_data.shape}')
print(f'NAs: {np.isnan(rbm_data).sum()}')
# print(rbm_data[0, 0])
print(f'MSPE for all: {((test_data - rbm_data)**2).mean()}')

test_data_rbm = np.load(r'real_time\pollutants_test_rbm.npy').transpose(0, 2, 1)
print(f'MSPE only for missing: {((test_data[np.isnan(test_data_rbm)] - rbm_data[np.isnan(test_data_rbm)])**2).mean()}')

imputation_plot(test_data, test_data_rbm, rbm_data, 'imputation rbm test data', r'real_time\imputation\imputation0_rbm')
imputation_plot_each_dim(test_data, test_data_rbm, rbm_data, 'imputation rbm test data', r'real_time\imputation\imputation0_rbm\each_dim')
# test_data[0, 0][1:10]
# test_data_rbm[0, 0][1:10]
# rbm_data[0, 0][1:10]
Sample output
Shape: (5, 26, 500)
NAs: 0
MSPE for all: 5.67021524531403
MSPE only for missing: 8.879126539941137
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:13<00:00,  1.97it/s] 

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/result/imputation0_rbm.png
Imputation result under rbm.

The imputation results for all stations are shown below.

[nanogallery2 gallery: rbm imputation results for all stations]

bm

## bm
filename = r'real_time\imputation\imputation0_bm.npy'
bm_data = np.load(filename)
print(f'Shape: {bm_data.shape}')
print(f'NAs: {np.isnan(bm_data).sum()}')
# print(bm_data[0, 0])
print(f'MSPE for all: {((test_data - bm_data)**2).mean()}')

test_data_bm = np.load(r'real_time\pollutants_test_bm.npy').transpose(0, 2, 1)
print(f'MSPE only for missing: {((test_data[np.isnan(test_data_bm)] - bm_data[np.isnan(test_data_bm)])**2).mean()}')

imputation_plot(test_data, test_data_bm, bm_data, 'imputation bm test data', r'real_time\imputation\imputation0_bm')
imputation_plot_each_dim(test_data, test_data_bm, bm_data, 'imputation bm test data', r'real_time\imputation\imputation0_bm\each_dim')
# test_data[0, 0][1:10]
# test_data_bm[0, 0][1:10]
# bm_data[0, 0][1:10]
Sample output
Shape: (5, 26, 500)
NAs: 0
MSPE for all: 5.733927828240412
MSPE only for missing: 14.33481884698311
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:13<00:00,  1.95it/s]

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/result/imputation0_bm.png
Imputation result under bm.

The imputation results for all stations are shown below.

[nanogallery2 gallery: bm imputation results for all stations]

tf

## tf
filename = r'real_time\imputation\imputation0_tf.npy'
tf_data = np.load(filename)
print(f'Shape: {tf_data.shape}')
print(f'NAs: {np.isnan(tf_data).sum()}')
# print(tf_data[0, 0])
print(f'MSPE for all: {((test_data - tf_data)**2).mean()}')

test_data_tf = np.load(r'real_time\pollutants_test_tf.npy').transpose(0, 2, 1)
print(f'MSPE only for missing: {((test_data[np.isnan(test_data_tf)] - tf_data[np.isnan(test_data_tf)])**2).mean()}')

imputation_plot(test_data, test_data_tf, tf_data, 'imputation tf test data', r'real_time\imputation\imputation0_tf')
imputation_plot_each_dim(test_data, test_data_tf, tf_data, 'imputation tf test data', r'real_time\imputation\imputation0_tf\each_dim')
# test_data[0, 0][1:10]
# test_data_tf[0, 0][1:10]
# tf_data[0, 0][1:10]
Sample output
Shape: (5, 26, 500)
NAs: 0
MSPE for all: 5.728256563966483
MSPE only for missing: 14.320640763169513
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:13<00:00,  1.94it/s]

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/result/imputation0_tf.png
Imputation result under tf.

The imputation results for all stations are shown below.

[nanogallery2 gallery: tf imputation results for all stations]

Conclusion

The imputation ran to completion, yet it seems we have "successfully failed"?! Every imputed series looks like white noise. Still, from rbm we can see that some missing stretches were genuinely imputed well, with predictions close to the true values (covering the gray ground truth); and for tf and bm, the predictions on the non-missing portions do match the truth. For rm and rbm, however, even the non-missing portions are predicted poorly, which makes no sense given that only_generate_missing: true was set.
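One way to pin this down is to check the pass-through property directly: with only_generate_missing: true, the output at observed positions should equal the input. A quick sketch, reusing the arrays loaded in the rm analysis above:

obs = ~np.isnan(test_data_rm)                       # observed (non-missing) positions
print(np.abs(test_data[obs] - rm_data[obs]).max())  # ~0 if observed values truly pass through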

I have not yet found the cause of this behavior; it needs further investigation.

https://raw.githubusercontent.com/Josh-test-lab/website-assets-repository/refs/heads/main/posts/1140429%20meeting/To%20be%20continued.jpg
To be continued.