-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path main.py
More file actions
203 lines (169 loc) · 10.5 KB
/
main.py
File metadata and controls
203 lines (169 loc) · 10.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import os
import torch
import argparse
import numpy as np
from torch import optim
from accelerate import Accelerator, DeepSpeedPlugin, DistributedDataParallelKwargs
from src.model import LmteModel
from src.solver import LmteSolver
from src.utils.set_seed import set_seed
from src.build_dataloader import build_dataloader
from src.utils.read_data import read_paths_from_file, read_graph_from_json
from src.utils.useful_functions import compute_ksp_paths, get_paths_to_edges, get_commodities_to_paths
def parse_args():
    """Parse command-line arguments for the LMTE traffic-engineering experiment.

    Returns:
        argparse.Namespace: all configuration parameters, grouped below into
        topology, training, experiment, model-architecture, and failure/burst
        testing options.
    """
    parser = argparse.ArgumentParser(description='Main Script')
    parser.add_argument('--seed', type=int, default=2025, help='random seed')
    # Topology-related arguments
    parser.add_argument("--topology", type=str, default='GEANT',
                        choices=['Abilene', 'GEANT', 'CERNET', 'UsCarrier', 'Cogentco'],
                        help="Name of the topology to be used.")
    parser.add_argument("--topology_filepath", type=str, default='./data/GEANT/topology.json',
                        help="Name of .json file the topology was stored.")
    parser.add_argument("--tm_filepath", type=str, default='./data/GEANT/GEANT.csv',
                        help="Name of .csv file the traffic matrices were stored.")
    # Training-related arguments
    parser.add_argument('--num_itrs', type=int, default=1, help='experiments times')
    parser.add_argument('--train_epochs', type=int, default=10, help='train epochs')
    parser.add_argument('--batch_size', type=int, default=32, help='batch size of train input data')
    parser.add_argument('--eval_batch_size', type=int, default=1, help='batch size of model evaluation')
    parser.add_argument('--patience', type=int, default=3, help='early stopping patience')
    parser.add_argument('--learning_rate', type=float, default=1e-3, help='optimizer learning rate')
    # Experiment status and configuration arguments
    parser.add_argument('--is_training', type=int, default=1, help='status')
    parser.add_argument("--num_paths", type=int, default=8,
                        help="Number of optimized tunnels per OD pair for searching.")
    parser.add_argument('--window_size', type=int, default=12,
                        help='history traffic matrix sequence length')
    parser.add_argument("--scale", type=int, default=10**9, help="Normalized scale.")
    parser.add_argument('--checkpoint_path', type=str, default='./checkpoints/',
                        help='location of model checkpoints')
    parser.add_argument('--result_path', type=str, default='./results/',
                        help='location of computed mlus')
    # Model architecture arguments
    # NOTE: default was previously the *string* '4096'; it only worked because
    # argparse re-parses string defaults through `type`. Use a real int instead.
    parser.add_argument('--llm_dim', type=int, default=4096, help='LLM model dimension')
    parser.add_argument('--llm_layers', type=int, default=1, help='LLM model layers')
    parser.add_argument('--d_keys', type=int, default=32, help='dimension of fcn')
    parser.add_argument('--n_heads', type=int, default=4, help='num of heads')
    parser.add_argument('--d_model', type=int, default=32, help='dimension of model')
    parser.add_argument('--num_gnn_layers', type=int, default=3, help='num of GNN layers')
    parser.add_argument('--num_rnn_layers', type=int, default=2, help='num of RNN layers')
    parser.add_argument('--num_dnn_layers', type=int, default=4, help='num of Dense layers')
    parser.add_argument('--dropout', type=float, default=0., help='dropout')
    parser.add_argument('--llm_model', type=str, default='llama-8b',
                        help='model name, options: [llama-8b, llama-3b, llama-1b, llama-13b]')
    # Flag name differs from its dest on purpose: `--is-lightweight` toggles
    # the route-aware (divide-head) implementation read as `args.is_divide`.
    parser.add_argument('--is-lightweight', dest="is_divide", action='store_true',
                        help='whether to use route-aware implementation')
    # Failure and burst testing arguments
    parser.add_argument('--add_failures', type=int, default=0,
                        help='whether to add failures during testing')
    parser.add_argument('--num_failures', type=int, default=1,
                        help='number of failures to add during testing')
    parser.add_argument('--add_bursts', type=int, default=0,
                        help='whether to add bursts during testing')
    parser.add_argument('--burst_factor', type=float, default=10.,
                        help='burst factor for traffic matrix fluctuations')
    return parser.parse_args()
def _scipy_to_sparse_tensor(matrix):
    """Convert a scipy sparse matrix to a float32 torch.sparse_coo_tensor."""
    coo = matrix.tocoo()
    return torch.sparse_coo_tensor(np.vstack((coo.row, coo.col)),
                                   torch.FloatTensor(coo.data),
                                   torch.Size(coo.shape))


def main():
    """Run the LMTE traffic-engineering experiment end to end.

    Steps:
      1. Parse command-line arguments and seed RNGs.
      2. Set up distributed training (Accelerate + DeepSpeed).
      3. Load the topology and traffic matrices; compute or load tunnel paths.
      4. Build sparse paths-to-edges and commodities-to-paths matrices.
      5. For each iteration: build loaders, model, optimizer and solver, then
         train (or load a checkpoint) and test.
      6. Save the collected MLU results as a .npy array.
    """
    args = parse_args()
    set_seed(args.seed)  # Reproducibility

    # Configure distributed training with Accelerate.
    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    deepspeed_plugin = DeepSpeedPlugin(hf_ds_config='./deepspeed_cfg.json')
    accelerator = Accelerator(kwargs_handlers=[ddp_kwargs], deepspeed_plugin=deepspeed_plugin)

    # Load network topology and link capacities from file.
    topo, capacities = read_graph_from_json(args.topology_filepath)
    num_nodes = len(topo.nodes())
    num_edges = len(topo.edges())
    num_paths = args.num_paths  # Candidate tunnels per OD pair

    # Generate k-shortest paths for all OD pairs unless already cached on disk.
    tunnels_filename = f'./data/{args.topology}/paths.txt'
    if not os.path.exists(tunnels_filename):
        pairs = [(i, j) for i in range(num_nodes) for j in range(num_nodes) if i != j]
        _ = compute_ksp_paths(k=args.num_paths, pairs=pairs, graph=topo, save2txt=True,
                              filepath=f'./data/{args.topology}', transform=True)
    paths = read_paths_from_file(filepath=tunnels_filename, num_nodes=num_nodes, convert=True)

    # Sparse incidence matrices consumed by the solver.
    p_matrix = get_paths_to_edges(topo, paths=paths)
    paths_to_edges = _scipy_to_sparse_tensor(p_matrix)
    c_matrix = get_commodities_to_paths(topo, num_paths=p_matrix.shape[0], paths=paths)
    commodities_to_paths = _scipy_to_sparse_tensor(c_matrix)

    # Hoisted out of the loop (loop-invariant; also used after the loop, which
    # previously raised NameError when --num_itrs 0).
    setting = f'{args.topology}_lmte'

    results = []  # One entry per experiment iteration
    for ii in range(args.num_itrs):
        # Build data loaders for training, validation, and testing.
        train_loader, valid_loader, test_loader = build_dataloader(
            args.topology_filepath, args.tm_filepath, args.batch_size,
            args.scale, args.eval_batch_size, args.window_size,
            split_ratio=(0.7, 0.1, 0.2))
        # Longest padded path drives the model's sequence length.
        max_path_length = train_loader.dataset.get_padded_edge_ids_per_path(paths).shape[-1]
        model = LmteModel(args.d_model, args.llm_dim, num_nodes, num_paths, num_edges,
                          args.num_gnn_layers, args.num_rnn_layers, args.num_dnn_layers,
                          args.d_keys, args.window_size, args.n_heads, args.dropout,
                          d_middle=args.d_model, llm_layers=args.llm_layers,
                          llm_model=args.llm_model, max_length=max_path_length,
                          use_divide_head=args.is_divide).float()
        # Optimize only the parameters left trainable (frozen ones are skipped).
        trained_parameters = [p for p in model.parameters() if p.requires_grad]
        optimizer = optim.Adam(trained_parameters, lr=args.learning_rate)
        # Wrap everything for distributed execution.
        train_loader, valid_loader, test_loader, model, optimizer = accelerator.prepare(
            train_loader, valid_loader, test_loader, model, optimizer)

        model_save_path = os.path.join(args.checkpoint_path, f'{setting}_{ii}')
        os.makedirs(model_save_path, exist_ok=True)  # race-safe vs. exists()+makedirs()

        solver = LmteSolver(args, model, topo, accelerator, train_loader, valid_loader,
                            commodities_to_paths, paths_to_edges, optimizer, paths)
        if args.is_training:
            solver.adapt(model_save_path, auto_fail=True, auto_burst=True)  # Train
        else:
            solver.load_checkpoint(model_save_path)  # Test-only: load pre-trained weights
        results.append(solver.test(test_loader, args.add_failures, args.num_failures,
                                   args.add_bursts, args.burst_factor))

    # Save per-iteration results; the filename encodes the test perturbation.
    results_save_path = os.path.join(args.result_path, setting)
    os.makedirs(results_save_path, exist_ok=True)
    if args.add_failures:
        save_filename = 'mlu_results_f.npy'
    elif args.add_bursts:
        save_filename = 'mlu_results_b.npy'
    else:
        save_filename = 'mlu_results.npy'
    np.save(os.path.join(results_save_path, save_filename), np.array(results))


if __name__ == '__main__':
    main()