```yaml
# Training configuration
batch_size: 1                              # Batch size
output_directory: "./results/real_time"    # Output directory for checkpoints and logs
ckpt_iter: "max"                           # Checkpoint mode (max or min)
iters_per_ckpt: 1000                       # Checkpoint frequency (number of iterations)
iters_per_logging: 100                     # Log frequency (number of iterations)
n_iters: 20000                             # Maximum number of iterations
learning_rate: 0.001                       # Learning rate

# Additional training settings
only_generate_missing: true                # Generate missing values only
use_model: 2                               # Model to use for training
masking: "rm"                              # Masking strategy for missing values
missing_k: 200                             # Number of missing values

# Data paths
data:
  train_path: "./datasets/real_time/pollutants_train.npy"   # Path to training data
```
```yaml
wavenet:                                   # WaveNet model parameters
  input_channels: 26                       # Number of input channels
  output_channels: 26                      # Number of output channels
  residual_layers: 36                      # Number of residual layers
  residual_channels: 256                   # Number of channels in residual blocks
  skip_channels: 256                       # Number of channels in skip connections

  # Diffusion step embedding dimensions
  diffusion_step_embed_dim_input: 128      # Input dimension
  diffusion_step_embed_dim_hidden: 512     # Middle dimension
  diffusion_step_embed_dim_output: 512     # Output dimension

  # Structured State Spaces sequence model (S4) configuration
  s4_max_sequence_length: 2000             # Maximum sequence length
  s4_state_dim: 64                         # State dimension
  s4_dropout: 0.0                          # Dropout rate
  s4_bidirectional: true                   # Whether to use bidirectional layers
  s4_use_layer_norm: true                  # Whether to use layer normalization

diffusion:                                 # Diffusion model parameters
  T: 200                                   # Number of diffusion steps
  beta_0: 0.0001                           # Initial beta value
  beta_T: 0.02                             # Final beta value
```
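The `diffusion` block fully determines the noise schedule. The sketch below assumes a linear beta schedule, the common choice in DiffWave-style diffusion models; whether this project uses exactly that schedule is an assumption, and the variable names are illustrative only.

```python
import numpy as np

# Values taken from the diffusion block above
T, beta_0, beta_T = 200, 0.0001, 0.02

beta = np.linspace(beta_0, beta_T, T)   # assumed linear schedule of T noise levels
alpha = 1.0 - beta
alpha_bar = np.cumprod(alpha)           # cumulative product used to noise x_0 directly at step t

print(beta[:3])        # smallest noise levels at the start of the schedule
print(alpha_bar[-1])   # sanity check: alpha_bar decays toward 0 as t -> T
```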
inference.yaml
```yaml
# Inference configuration
batch_size: 1                              # Batch size for inference
output_directory: "./results/real_time/12/inference/mnr"   # Output directory for inference results
ckpt_path: "./results/real_time"           # Path to checkpoint for inference
trials: 1                                  # Number of imputation replications

# Additional inference settings
only_generate_missing: true                # Generate missing values only
use_model: 2                               # Model to use for inference
masking: "mnr"                             # Masking strategy for missing values
missing_k: 12                              # Number of missing values

# Data paths
data:
  test_path: "./datasets/real_time/pollutants_test.npy"     # Path to test data
```
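For orientation, the masking strategies referenced above ("rm", "mnr", "bm", "tf") control which values are artificially removed before imputation. The sketch below shows one plausible reading of "rm" (random missing) with `missing_k` points dropped per channel; the helper name `build_rm_mask` and the exact sampling are illustrative assumptions, not this repository's implementation.

```python
import torch

def build_rm_mask(sample: torch.Tensor, missing_k: int) -> torch.Tensor:
    """Hypothetical sketch: mark `missing_k` random time steps per channel as missing.

    `sample` is assumed to have shape (channels, length); the returned mask is 1
    where values are observed and 0 where they are treated as missing.
    """
    channels, length = sample.shape
    mask = torch.ones(channels, length)
    for c in range(channels):
        drop = torch.randperm(length)[:missing_k]  # pick missing_k random positions
        mask[c, drop] = 0.0
    return mask
```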
```python
from typing import Union

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset


def get_dataloader(
    path: str,
    batch_size: int,
    is_shuffle: bool = True,
    device: Union[str, torch.device] = "cpu",
    num_workers: int = 0,
) -> DataLoader:
    """
    Get a PyTorch DataLoader for the dataset stored at the given path.

    Args:
        path (str): Path to the dataset file.
        batch_size (int): Size of each batch.
        is_shuffle (bool, optional): Whether to shuffle the dataset. Defaults to True.
        device (Union[str, torch.device], optional): Device to move the data to. Defaults to "cpu".
        num_workers (int, optional): Number of subprocesses to use for data loading. Defaults to 0.

    Returns:
        DataLoader: PyTorch DataLoader for the dataset.
    """
    dataset = TensorDataset(torch.from_numpy(np.load(path)).to(dtype=torch.float32))
    # Pin host memory only when the batches will be moved to a CUDA device.
    pin_memory = device == "cuda" or device == torch.device("cuda")
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=is_shuffle,
        pin_memory=pin_memory,
        num_workers=num_workers,
    )
```
```python
# test imputation
import os

import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

test_data = np.load(r'real_time\pollutants_test.npy').transpose(0, 2, 1)


## functions
def imputation_plot(full_data, missing_data, imputation_data, title, save_path):
    """
    Plot the first `show_dims` dimensions of the first sample, comparing the full,
    missing, and imputed data.
    """
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # ensure the output directory exists
    show_dims = 2
    fig, axes = plt.subplots(show_dims, 1, figsize=(12, 8), sharex=True)
    # imputation_data = np.where(np.isnan(missing_data), imputation_data, np.nan)
    for j in range(show_dims):
        axes[j].plot(full_data[0, j], color='gray', label='full data', alpha=0.6)
        axes[j].plot(imputation_data[0, j], color='orange', label='imputation data', alpha=0.6)
        axes[j].plot(missing_data[0, j], color='red', label=title)
        axes[j].set_ylabel(f'Dim {j}')
        axes[j].legend()
    plt.suptitle(title)
    plt.xlabel('Time')
    plt.tight_layout()
    plt.savefig(f"{save_path}.png", dpi=300)
    # plt.show()


def imputation_plot_each_dim(full_data, missing_data, imputation_data, title, save_dir):
    """
    Plot each dimension of the first sample separately, comparing full data, missing data,
    and imputation data. Save each plot as a separate file.
    """
    num_dims = full_data.shape[1]
    os.makedirs(save_dir, exist_ok=True)  # ensure the save directory exists
    for j in tqdm(range(num_dims)):
        fig, ax = plt.subplots(figsize=(12, 4))
        ax.plot(full_data[0, j], color='gray', label='full data', alpha=0.6)
        ax.plot(imputation_data[0, j], color='orange', label='imputation data', alpha=0.6)
        ax.plot(missing_data[0, j], color='red', label=title)
        ax.set_ylabel(f'Dim {j}')
        ax.set_xlabel('Time')
        ax.set_title(f'{title} - Dimension {j}')
        # Place the legend outside the plot area
        ax.legend(
            loc='center left',        # anchor the legend by its center-left point
            bbox_to_anchor=(1, 0.5),  # (x, y): x=1 is the right edge of the axes, so the legend sits just outside it
        )
        fig.tight_layout(rect=[0, 0, 0.85, 1])  # shrink the plot area to leave room on the right for the legend
        save_path = os.path.join(save_dir, f"{title}_dim{j}.png")
        plt.savefig(save_path, dpi=300)
        plt.close(fig)  # close each figure, otherwise plotting this many figures exhausts memory


def analysis_predict_data(missing_k, path):
    folder_path = f'.\\real_time\\imputation\\{missing_k}\\predict\\{path}\\T200_beta00.0001_betaT0.02\\max'
    os.makedirs(folder_path, exist_ok=True)
    length = len(os.listdir(folder_path))
    imputation = []
    for i in range(length):
        file_path = os.path.join(folder_path, f'imputation{i}.npy')
        arr = np.load(file_path)
        imputation.append(arr)
    predict_data = np.concatenate(imputation, axis=0)
    print(f'Shape: {predict_data.shape}')
    print(f'NAs: {np.isnan(predict_data).sum()}')
    # print(predict_data[0, 0])
    print(f'MSPE for all: {((test_data - predict_data) ** 2).mean()}')
    test_data_predict = np.load(f'real_time\\{missing_k}\\pollutants_test_{path}.npy').transpose(0, 2, 1)
    print(f'MSPE only for missing: '
          f'{((test_data[np.isnan(test_data_predict)] - predict_data[np.isnan(test_data_predict)]) ** 2).mean()}')
    imputation_plot(test_data, test_data_predict, predict_data,
                    f'imputation {path} test data',
                    f'real_time\\imputation\\{missing_k}\\result\\predict\\imputation0_{path}')
    imputation_plot_each_dim(test_data, test_data_predict, predict_data,
                             f'imputation {path} test data',
                             f'real_time\\imputation\\{missing_k}\\result\\predict\\imputation0_{path}')
    # test_data[0, 0][1:10]
    # test_data_predict[0, 0][1:10]
    # predict_data[0, 0][1:10]


missing_k = 200

## rm
analysis_predict_data(missing_k, 'rm')
## rbm
analysis_predict_data(missing_k, 'rbm')
## bm
analysis_predict_data(missing_k, 'bm')
## tf
analysis_predict_data(missing_k, 'tf')


# original
def analysis_imputation_data(missing_k, path):
    folder_path = f'.\\real_time\\imputation\\{missing_k}\\inference\\{path}\\T200_beta00.0001_betaT0.02\\max'
    os.makedirs(folder_path, exist_ok=True)
    length = len(os.listdir(folder_path))
    imputation = []
    for i in range(length):
        file_path = os.path.join(folder_path, f'imputation{i}.npy')
        arr = np.load(file_path)
        imputation.append(arr)
    imputation_data = np.concatenate(imputation, axis=0)
    print(f'Shape: {imputation_data.shape}')
    print(f'NAs: {np.isnan(imputation_data).sum()}')
    # print(imputation_data[0, 0])
    print(f'MSPE: {((test_data - imputation_data) ** 2).mean()}')
    imputation_plot(test_data, test_data, imputation_data,
                    f'imputation {path} test data',
                    f'real_time\\imputation\\{missing_k}\\result\\original\\imputation0_{path}')
    imputation_plot_each_dim(test_data, test_data, imputation_data,
                             f'imputation {path} test data',
                             f'real_time\\imputation\\{missing_k}\\result\\original\\imputation0_{path}')
    # test_data[0, 0][1:10]
    # test_data_imputation[0, 0][1:10]
    # imputation_data[0, 0][1:10]


## rm
analysis_imputation_data(missing_k, 'rm')
## bm
analysis_imputation_data(missing_k, 'bm')
## mnr
analysis_imputation_data(missing_k, 'mnr')
## tf
analysis_imputation_data(missing_k, 'tf')
```
Juan Lopez Alcaraz and Nils Strodthoff (2022). Diffusion-based time series imputation and forecasting with structured state space models. Transactions on Machine Learning Research. Retrieved from https://openreview.net/forum?id=hHiIbk7ApW