From 360b795fac98251ba630b3b36d4b960521e84cae Mon Sep 17 00:00:00 2001 From: ethereal Date: Wed, 18 Jun 2025 15:22:17 +0800 Subject: [PATCH 01/27] doc(docker): add docker usage into README files; add a docker-compose file --- vermeer/README.md | 22 ++++++++++++++++++++++ vermeer/README.zh-CN.md | 20 ++++++++++++++++++++ vermeer/docker-compose.yaml | 29 +++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) create mode 100644 vermeer/docker-compose.yaml diff --git a/vermeer/README.md b/vermeer/README.md index 776956625..55ca14b02 100644 --- a/vermeer/README.md +++ b/vermeer/README.md @@ -3,6 +3,28 @@ ## Introduction Vermeer is a high-performance distributed graph computing platform based on memory, supporting more than 15 graph algorithms, custom algorithm extensions, and custom data source access. +## Run with Docker + +Pull the image: +``` +docker pull hugegraph/vermeer:latest +``` + +Create local configuration files, for example, `~/master.ini` and `~/worker.ini`. + +Run with Docker. The `--env` flag specifies the file name. + +``` +master: docker run -v ~/:/go/bin/config hugegraph/vermeer --env=master +worker: docker run -v ~/:/go/bin/config hugegraph/vermeer --env=worker +``` + +We've also provided a `docker-compose` file. Once you've created `~/master.ini` and `~/worker.ini`, and updated the `master_peer` in `worker.ini` to `172.20.0.10:6689`, you can run it using the following command: + +``` +docker-compose up -d +``` + ## Start ``` diff --git a/vermeer/README.zh-CN.md b/vermeer/README.zh-CN.md index 34dcec04c..1b125fa38 100644 --- a/vermeer/README.zh-CN.md +++ b/vermeer/README.zh-CN.md @@ -3,6 +3,26 @@ ## 简介 Vermeer是一个基于内存的高性能分布式图计算平台,支持15+图算法。支持自定义算法扩展,支持自定义数据源接入。 +## 基于 Docker 运行 + +拉取镜像 +``` +docker pull hugegraph/vermeer:latest +``` + +创建好本地配置文件,例如`~/master.ini`与`~/worker.ini` + +基于docker运行,其中`--env`指定的是文件名称。 +``` +master: docker run -v ~/:/go/bin/config hugegraph/vermeer --env=master +worker: docker run -v ~/:/go/bin/config hugegraph/vermeer --env=worker +``` + +我们也提供了`docker-compose`文件,当创建好`~/master.ini`与`~/worker.ini`,将`worker.ini`中的`master_peer`修改为`172.20.0.10:6689`后,即可通过以下命令运行: +``` +docker-compose up -d +``` + ## 运行 ``` diff --git a/vermeer/docker-compose.yaml b/vermeer/docker-compose.yaml new file mode 100644 index 000000000..35a506170 --- /dev/null +++ b/vermeer/docker-compose.yaml @@ -0,0 +1,29 @@ +version: '3.8' + +services: + vermeer-master: + image: hugegraph/vermeer + container_name: vermeer-master + volumes: + - ~/:/go/bin/config # Change here to your actual config path + command: --env=master + networks: + vermeer_network: + ipv4_address: 172.20.0.10 # Assign a static IP for the master + + vermeer-worker: + image: hugegraph/vermeer + container_name: vermeer-worker + volumes: + - ~/:/go/bin/config # Change here to your actual config path + command: --env=worker + networks: + vermeer_network: + ipv4_address: 172.20.0.11 # Assign a static IP for the worker + +networks: + vermeer_network: + driver: bridge + ipam: + config: + - subnet: 172.20.0.0/24 # Define the subnet for your network \ No newline at end of file From 09dab68ec81fee7c2e4c7d523fce76eb0d56e848 Mon Sep 17 00:00:00 2001 From: ethereal Date: Sat, 19 Jul 2025 23:59:08 +0800 Subject: [PATCH 02/27] chore: add test case into gitignore --- vermeer/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vermeer/.gitignore b/vermeer/.gitignore index 540a67ae4..e9502ea66 100644 --- a/vermeer/.gitignore +++ b/vermeer/.gitignore @@ -83,3 +83,7 @@ node_modules/ /output/ /bin/* !/bin/*.sh + +# 其他 # 
+###################### +test/case/ From 318c0e872ec76215d8c1e43d4078f95dcb5e2a67 Mon Sep 17 00:00:00 2001 From: ethereal Date: Sun, 20 Jul 2025 01:18:24 +0800 Subject: [PATCH 03/27] chore: tmp framework --- .../apps/master/schedules/resource_manager.go | 39 +++++++++++++++++++ .../master/schedules/scheduler_manager.go | 30 ++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 vermeer/apps/master/schedules/resource_manager.go create mode 100644 vermeer/apps/master/schedules/scheduler_manager.go diff --git a/vermeer/apps/master/schedules/resource_manager.go b/vermeer/apps/master/schedules/resource_manager.go new file mode 100644 index 000000000..fb178e2d5 --- /dev/null +++ b/vermeer/apps/master/schedules/resource_manager.go @@ -0,0 +1,39 @@ +package schedules + +type WorkerOngoingStatus string + +const ( + WorkerOngoingStatusIdle WorkerOngoingStatus = "idle" + WorkerOngoingStatusRunning WorkerOngoingStatus = "running" + WorkerOngoingStatusPaused WorkerOngoingStatus = "paused" +) + +type ResourceManager struct { + workerStatus map[string]WorkerOngoingStatus +} + +func (rm *ResourceManager) Init() { + rm.workerStatus = make(map[string]WorkerOngoingStatus) +} + +func (rm *ResourceManager) Lock() { + // Implement locking logic if necessary +} + +func (rm *ResourceManager) Unlock() { + // Implement unlocking logic if necessary +} + +func (rm *ResourceManager) ReleaseByTaskID(taskID int32) { + rm.Lock() + defer rm.Unlock() + + for worker, status := range rm.workerStatus { + if status == WorkerOngoingStatusRunning && rm.isTaskRunningOnWorker(worker, taskID) { + delete(rm.workerStatus, worker) + break + } + } +} + +func (rm *ResourceManager) GetWorkerGroupStatus() {} diff --git a/vermeer/apps/master/schedules/scheduler_manager.go b/vermeer/apps/master/schedules/scheduler_manager.go new file mode 100644 index 000000000..bb16bbf38 --- /dev/null +++ b/vermeer/apps/master/schedules/scheduler_manager.go @@ -0,0 +1,30 @@ +package schedules + +import "vermeer/apps/structure" + +type SchedulerManager struct { + // resource management + // algorithm management + // task management +} + +func (s *SchedulerManager) Init() *SchedulerManager { + return &SchedulerManager{} +} + +func (s *SchedulerManager) ReleaseByTaskID(taskID int32) { + // Implement logic to release resources by task ID + + // trace tasks need these workers, check if these tasks are available +} + +func (s *SchedulerManager) GetNextTask(spaceName string) []*structure.TaskInfo { + // Implement logic to get the next task in the queue for the given space + + // step 1: make sure all tasks have alloc to a worker group + + // step 2: sort available tasks by priority (by calling algorithm's GetNextTask method) + + // step 3: return the task with the highest priority or small tasks which can be executed immediately + return nil +} From 8a7e1520a048b42293a3b594969a9611ebc73f6e Mon Sep 17 00:00:00 2001 From: ethereal Date: Mon, 21 Jul 2025 00:33:35 +0800 Subject: [PATCH 04/27] chore: reorgnize --- vermeer/apps/master/bl/scheduler_bl.go | 170 +++++------------- .../master/schedules/algorithm_manager.go | 17 ++ .../apps/master/schedules/resource_manager.go | 68 ++++++- .../master/schedules/scheduler_manager.go | 144 ++++++++++++++- vermeer/apps/master/schedules/task_manager.go | 101 +++++++++++ 5 files changed, 366 insertions(+), 134 deletions(-) create mode 100644 vermeer/apps/master/schedules/algorithm_manager.go create mode 100644 vermeer/apps/master/schedules/task_manager.go diff --git a/vermeer/apps/master/bl/scheduler_bl.go 
b/vermeer/apps/master/bl/scheduler_bl.go index bc4ccac49..6a0200940 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -19,9 +19,7 @@ package bl import ( "errors" - "strconv" "time" - "vermeer/apps/common" "vermeer/apps/master/schedules" "vermeer/apps/structure" @@ -30,34 +28,16 @@ import ( type ScheduleBl struct { structure.MutexLocker - dispatchLocker structure.MutexLocker - spaceQueue *schedules.SpaceQueue - broker *schedules.Broker - startChan chan *structure.TaskInfo - isDispatchPaused bool + schedulerManager *schedules.SchedulerManager } func (s *ScheduleBl) Init() { - const defaultChanSizeConfig = "10" - chanSize := common.GetConfigDefault("start_chan_size", defaultChanSizeConfig).(string) - // Convert string to int - chanSizeInt, err := strconv.Atoi(chanSize) - if err != nil { - logrus.Errorf("failed to convert start_chan_size to int: %v", err) - logrus.Infof("using default start_chan_size: %s", defaultChanSizeConfig) - chanSizeInt, _ = strconv.Atoi(defaultChanSizeConfig) - } - startChan := make(chan *structure.TaskInfo, chanSizeInt) - s.startChan = startChan - s.spaceQueue = (&schedules.SpaceQueue{}).Init() - s.broker = (&schedules.Broker{}).Init() - - go s.waitingTask() - go s.startTicker() + s.schedulerManager = &schedules.SchedulerManager{} + s.schedulerManager.Init(taskMgr.SetState, taskMgr.SetError) } func (s *ScheduleBl) PeekSpaceTail(space string) *structure.TaskInfo { - return s.spaceQueue.PeekTailTask(space) + return s.schedulerManager.GetLastTask(space) } // QueueTask Add the task to the inner queue. @@ -78,14 +58,12 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { } // Notice: Ensure successful invocation. - ok, err := s.spaceQueue.PushTask(taskInfo) + ok, err := s.schedulerManager.QueueTask(taskInfo) if err != nil { taskMgr.SetError(taskInfo, err.Error()) return ok, err } - go s.dispatch() - return ok, nil } @@ -94,14 +72,12 @@ func (s *ScheduleBl) CancelTask(taskInfo *structure.TaskInfo) error { return errors.New("the argument `taskInfo` is nil") } - s.Lock() - isHeadTask := s.spaceQueue.IsHeadTask(taskInfo.ID) - task := s.spaceQueue.RemoveTask(taskInfo.ID) - s.Unlock(nil) - + isHeadTask := s.schedulerManager.IsTaskOngoing(taskInfo.ID) + task := s.schedulerManager.RemoveTask(taskInfo.ID) + // err := s.schedulerManager.CancelTask(taskInfo) isInQueue := false if task != nil { - logrus.Infof("removed task '%d' from space queue", task.ID) + logrus.Infof("removed task '%d' from space queue", taskInfo.ID) isInQueue = true } @@ -120,33 +96,53 @@ func (s *ScheduleBl) CancelTask(taskInfo *structure.TaskInfo) error { } func (s *ScheduleBl) IsDispatchPaused() bool { - return s.isDispatchPaused + return s.schedulerManager.IsDispatchPaused() } func (s *ScheduleBl) PauseDispatch() { - s.isDispatchPaused = true + s.schedulerManager.PauseDispatch() } func (s *ScheduleBl) ResumeDispatch() { - s.isDispatchPaused = false + s.schedulerManager.ResumeDispatch() } func (s *ScheduleBl) AllTasksInQueue() []*structure.TaskInfo { - return s.spaceQueue.AllTasks() + return s.schedulerManager.AllTasksInQueue() } func (s *ScheduleBl) TasksInQueue(space string) []*structure.TaskInfo { - return s.spaceQueue.SpaceTasks(space) + return s.schedulerManager.TasksInQueue(space) } func (s *ScheduleBl) CloseCurrent(taskId int32) error { + s.schedulerManager.ReleaseByTaskID(taskId) + logrus.Infof("invoke dispatch when task '%d' is closed", taskId) - s.dispatch() + s.schedulerManager.TryScheduleNextTasks() + return nil +} + +func 
(s *ScheduleBl) handleCancelTask(taskInfo *structure.TaskInfo) error { + logrus.Infof("received task '%d' to cancel", taskInfo.ID) + canceler, err := NewTaskCanceler(taskInfo) + if err != nil { + logrus.Errorf("failed to create new TaskCanceler err: %v", err) + taskMgr.SetError(taskInfo, err.Error()) + return err + } + + if err := canceler.CancelTask(); err != nil { + logrus.Errorf("failed to cancel task '%d', caused by: %v", taskInfo.ID, err) + taskMgr.SetError(taskInfo, err.Error()) + return err + } return nil } +// now, start task! func (s *ScheduleBl) handleStartTask(taskInfo *structure.TaskInfo) { - agent, status, err := s.broker.ApplyAgent(taskInfo) + agent, status, err := s.schedulerManager.GetAgent(taskInfo) if err != nil { logrus.Errorf("apply agent error: %v", err) @@ -154,14 +150,14 @@ func (s *ScheduleBl) handleStartTask(taskInfo *structure.TaskInfo) { return } - switch status { - case schedules.AgentStatusNoWorker: - fallthrough - case schedules.AgentStatusWorkerNotReady: - logrus.Warnf("failed to apply an agent for task '%d', graph: %s/%s, status: %s", - taskInfo.ID, taskInfo.SpaceName, taskInfo.GraphName, status) - return - } + // switch status { + // case schedules.AgentStatusNoWorker: + // fallthrough + // case schedules.AgentStatusWorkerNotReady: + // logrus.Warnf("failed to apply an agent for task '%d', graph: %s/%s, status: %s", + // taskInfo.ID, taskInfo.SpaceName, taskInfo.GraphName, status) + // return + // } if agent == nil { logrus.Infof("no available agent for task '%d', graph: %s/%s, status: %s", @@ -175,24 +171,6 @@ func (s *ScheduleBl) handleStartTask(taskInfo *structure.TaskInfo) { go s.startWaitingTask(agent, taskInfo) } -func (s *ScheduleBl) handleCancelTask(taskInfo *structure.TaskInfo) error { - logrus.Infof("received task '%d' to cancel", taskInfo.ID) - canceler, err := NewTaskCanceler(taskInfo) - if err != nil { - logrus.Errorf("failed to create new TaskCanceler err: %v", err) - taskMgr.SetError(taskInfo, err.Error()) - return err - } - - if err := canceler.CancelTask(); err != nil { - logrus.Errorf("failed to cancel task '%d', caused by: %v", taskInfo.ID, err) - taskMgr.SetError(taskInfo, err.Error()) - return err - } - - return nil -} - func (s *ScheduleBl) startWaitingTask(agent *schedules.Agent, taskInfo *structure.TaskInfo) { logrus.Infof("starting a task, id: %v, type: %v, graph: %v", taskInfo.ID, taskInfo.Type, taskInfo.GraphName) @@ -226,64 +204,4 @@ func (s *ScheduleBl) startWaitingTask(agent *schedules.Agent, taskInfo *structur logrus.Errorf("failed to start a task, type: %s, taskID: %d, caused by: %v", taskInfo.Type, taskInfo.ID, err) taskMgr.SetError(taskInfo, err.Error()) } - -} - -func (s *ScheduleBl) dispatch() { - defer func() { - if err := recover(); err != nil { - logrus.Errorln("dispatch() has been recovered:", err) - } - }() - - if err := s.doDispatch(); err != nil { - logrus.Errorf("do dispatching error:%v", err) - } -} - -func (s *ScheduleBl) doDispatch() error { - if s.isDispatchPaused { - logrus.Warn("the dispatching was paused") - return nil - } - - defer s.dispatchLocker.Unlock(s.dispatchLocker.Lock()) - - buffer := s.spaceQueue.HeadTasks() - if len(buffer) == 0 { - return nil - } - - for _, task := range buffer { - select { - case s.startChan <- task: - default: - logrus.Warnf("the start channel is full, dropped task: %d", task.ID) - } - - } - - return nil -} - -func (s *ScheduleBl) waitingTask() { - for taskInfo := range s.startChan { - if taskInfo == nil { - logrus.Warnf("recieved a nil task from startChan") - return - } - - 
logrus.Infof("chan received task '%d' to start", taskInfo.ID) - s.handleStartTask(taskInfo) - } -} - -func (s *ScheduleBl) startTicker() { - // Create a ticker that triggers every 3 seconds - ticker := time.Tick(3 * time.Second) - - for range ticker { - //logrus.Debug("Ticker ticked") - s.dispatch() - } } diff --git a/vermeer/apps/master/schedules/algorithm_manager.go b/vermeer/apps/master/schedules/algorithm_manager.go new file mode 100644 index 000000000..de769cf1c --- /dev/null +++ b/vermeer/apps/master/schedules/algorithm_manager.go @@ -0,0 +1,17 @@ +package schedules + +type Algorithm interface { + // Name returns the name of the algorithm + Name() string + // Execute runs the algorithm with the provided parameters + Execute(params map[string]interface{}) (interface{}, error) + // Validate checks if the provided parameters are valid for the algorithm +} + +type AlgorithmManager struct { + supportedAlgorithms map[string]Algorithm +} + +func (am *AlgorithmManager) Init() { + am.supportedAlgorithms = make(map[string]Algorithm) +} diff --git a/vermeer/apps/master/schedules/resource_manager.go b/vermeer/apps/master/schedules/resource_manager.go index fb178e2d5..415045514 100644 --- a/vermeer/apps/master/schedules/resource_manager.go +++ b/vermeer/apps/master/schedules/resource_manager.go @@ -1,5 +1,10 @@ package schedules +import ( + "errors" + "vermeer/apps/structure" +) + type WorkerOngoingStatus string const ( @@ -10,6 +15,9 @@ const ( type ResourceManager struct { workerStatus map[string]WorkerOngoingStatus + // broker just responsible for communication with workers + // it can not apply tasks to workers directly + broker *Broker } func (rm *ResourceManager) Init() { @@ -36,4 +44,62 @@ func (rm *ResourceManager) ReleaseByTaskID(taskID int32) { } } -func (rm *ResourceManager) GetWorkerGroupStatus() {} +func (rm *ResourceManager) isTaskRunningOnWorker(worker string, taskID int32) bool { + // Implement logic to check if a task is running on a specific worker + // This is a placeholder implementation + return false // Replace with actual logic +} + +func (rm *ResourceManager) GetAgent(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { + if taskInfo == nil { + return nil, AgentStatusError, errors.New("taskInfo is nil") + } + + rm.Lock() + defer rm.Unlock() + + agent, status, err := rm.broker.ApplyAgent(taskInfo) + if err != nil { + return nil, AgentStatusError, err + } + if agent == nil { + return nil, status, nil + } + + // Assign the task to the agent + agent.AssignTask(taskInfo) + + return agent, status, nil +} + +func (rm *ResourceManager) IsDispatchPaused() bool { + rm.Lock() + defer rm.Unlock() + + for _, status := range rm.workerStatus { + if status == WorkerOngoingStatusPaused { + return true + } + } + return false +} + +func (rm *ResourceManager) PauseDispatch() { + rm.Lock() + defer rm.Unlock() + + for worker := range rm.workerStatus { + rm.workerStatus[worker] = WorkerOngoingStatusPaused + } +} + +func (rm *ResourceManager) ResumeDispatch() { + rm.Lock() + defer rm.Unlock() + + for worker := range rm.workerStatus { + if rm.workerStatus[worker] == WorkerOngoingStatusPaused { + rm.workerStatus[worker] = WorkerOngoingStatusIdle + } + } +} diff --git a/vermeer/apps/master/schedules/scheduler_manager.go b/vermeer/apps/master/schedules/scheduler_manager.go index bb16bbf38..9fcd68243 100644 --- a/vermeer/apps/master/schedules/scheduler_manager.go +++ b/vermeer/apps/master/schedules/scheduler_manager.go @@ -1,24 +1,90 @@ package schedules -import "vermeer/apps/structure" 
+import ( + "strconv" + "time" + "vermeer/apps/common" + "vermeer/apps/structure" + + "github.com/sirupsen/logrus" +) type SchedulerManager struct { // resource management + resourceManager *ResourceManager // algorithm management + algorithmManager *AlgorithmManager // task management + taskManager *TaskManager + // start channel for tasks to be started + startChan chan *structure.TaskInfo + // register callbacks + StartTaskCallback func(taskInfo *structure.TaskInfo) error } -func (s *SchedulerManager) Init() *SchedulerManager { - return &SchedulerManager{} +func (s *SchedulerManager) Init(SetTaskStatusCallback func(taskInfo *structure.TaskInfo, status structure.TaskState) error, + SetTaskErrorCallback func(taskInfo *structure.TaskInfo, errMsg string) bool) *SchedulerManager { + const defaultChanSizeConfig = "10" + chanSize := common.GetConfigDefault("start_chan_size", defaultChanSizeConfig).(string) + // Convert string to int + chanSizeInt, err := strconv.Atoi(chanSize) + if err != nil { + logrus.Errorf("failed to convert start_chan_size to int: %v", err) + logrus.Infof("using default start_chan_size: %s", defaultChanSizeConfig) + chanSizeInt, _ = strconv.Atoi(defaultChanSizeConfig) + } + startChan := make(chan *structure.TaskInfo, chanSizeInt) + s.startChan = startChan + + s.resourceManager = &ResourceManager{} + s.resourceManager.Init() + s.taskManager = &TaskManager{} + s.taskManager.Init() + s.algorithmManager = &AlgorithmManager{} + s.algorithmManager.Init() + go s.startTicker() + return s } -func (s *SchedulerManager) ReleaseByTaskID(taskID int32) { - // Implement logic to release resources by task ID +func (s *SchedulerManager) startTicker() { + // Create a ticker that triggers every 3 seconds + // TODO: make it configurable + ticker := time.Tick(3 * time.Second) - // trace tasks need these workers, check if these tasks are available + for range ticker { + //logrus.Debug("Ticker ticked") + s.TryScheduleNextTasks() + } +} + +func (s *SchedulerManager) waitingStartedTask() { + for taskInfo := range s.startChan { + if taskInfo == nil { + logrus.Warnf("recieved a nil task from startChan") + return + } + + logrus.Infof("chan received task '%d' to start", taskInfo.ID) + s.StartTaskCallback(taskInfo) + } +} + +// this make scheduler manager try to schedule next tasks +func (s *SchedulerManager) TryScheduleNextTasks() { + defer func() { + if err := recover(); err != nil { + logrus.Errorln("TryScheduleNextTasks() has been recovered:", err) + } + }() + + // TODO: make it configurable + if err := s.tryScheduleInner(true); err != nil { + logrus.Errorf("do scheduling error:%v", err) + } } -func (s *SchedulerManager) GetNextTask(spaceName string) []*structure.TaskInfo { +// Main routine to schedule tasks +func (s *SchedulerManager) tryScheduleInner(softSchedule bool) error { // Implement logic to get the next task in the queue for the given space // step 1: make sure all tasks have alloc to a worker group @@ -26,5 +92,69 @@ func (s *SchedulerManager) GetNextTask(spaceName string) []*structure.TaskInfo { // step 2: sort available tasks by priority (by calling algorithm's GetNextTask method) // step 3: return the task with the highest priority or small tasks which can be executed immediately + + // step 4: send to start channel + return nil } + +func (s *SchedulerManager) ReleaseByTaskID(taskID int32) { + // trace tasks need these workers, check if these tasks are available + s.taskManager.RemoveTask(taskID) + // release the worker group + s.resourceManager.ReleaseByTaskID(taskID) +} + +func (s 
*SchedulerManager) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { + // make sure all tasks have alloc to a worker group + s.taskManager.QueueTask(taskInfo) + + return true, nil +} + +func (s *SchedulerManager) GetLastTask(spaceName string) *structure.TaskInfo { + // Implement logic to get the last task in the queue for the given space + return s.taskManager.GetLastTask(spaceName) +} + +func (s *SchedulerManager) IsDispatchPaused() bool { + // Implement logic to check if dispatching is paused + return s.resourceManager.IsDispatchPaused() +} + +func (s *SchedulerManager) PauseDispatch() { + // Implement logic to pause dispatching + s.resourceManager.PauseDispatch() +} + +func (s *SchedulerManager) ResumeDispatch() { + // Implement logic to resume dispatching + s.resourceManager.ResumeDispatch() +} + +func (s *SchedulerManager) AllTasksInQueue() []*structure.TaskInfo { + // Implement logic to get all tasks in the queue + return s.taskManager.GetAllTasks() +} + +func (s *SchedulerManager) TasksInQueue(space string) []*structure.TaskInfo { + // Implement logic to get tasks in the queue for a specific space + return s.taskManager.GetTasksInQueue(space) +} + +// + +func (s *SchedulerManager) IsTaskOngoing(taskID int32) bool { + // Check if the task is ongoing + return s.taskManager.IsTaskOngoing(taskID) +} + +func (s *SchedulerManager) RemoveTask(taskID int32) error { + // Remove a task from the queue + return s.taskManager.RemoveTask(taskID) +} + +func (s *SchedulerManager) GetAgent(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { + // Get an agent for the given task + return s.resourceManager.GetAgent(taskInfo) +} diff --git a/vermeer/apps/master/schedules/task_manager.go b/vermeer/apps/master/schedules/task_manager.go new file mode 100644 index 000000000..5c71de2d8 --- /dev/null +++ b/vermeer/apps/master/schedules/task_manager.go @@ -0,0 +1,101 @@ +package schedules + +import ( + "errors" + "vermeer/apps/structure" +) + +type TaskManager struct { + // This struct is responsible for managing tasks in the scheduling system. + // A map from task ID to TaskInfo can be used to track tasks. + allTaskMap map[int32]*structure.TaskInfo + availableTaskMap map[int32]*structure.TaskInfo + // A map from task ID to worker group can be used to track which worker group is handling which task. 
+ workerGroupMap map[int32]string +} + +func (t *TaskManager) Init() *TaskManager { + t.allTaskMap = make(map[int32]*structure.TaskInfo) + t.availableTaskMap = make(map[int32]*structure.TaskInfo) + t.workerGroupMap = make(map[int32]string) + return t +} + +func (t *TaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { + if taskInfo == nil { + return false, errors.New("the argument `taskInfo` is nil") + } + + if taskInfo.SpaceName == "" { + return false, errors.New("the property `SpaceName` of taskInfo is empty") + } + + // Add the task to the task map + t.allTaskMap[taskInfo.ID] = taskInfo + t.AssignGroup(taskInfo) + return true, nil +} + +func (t *TaskManager) RemoveTask(taskID int32) error { + if _, exists := t.allTaskMap[taskID]; !exists { + return errors.New("task not found") + } + delete(t.allTaskMap, taskID) + delete(t.workerGroupMap, taskID) + return nil +} + +// update or create a task in the task map +func (t *TaskManager) AssignGroup(taskInfo *structure.TaskInfo) error { + group := workerMgr.ApplyGroup(taskInfo.SpaceName, taskInfo.GraphName) + if group == "" { + return errors.New("failed to assign group for task") + } + t.workerGroupMap[taskInfo.ID] = group + return nil +} + +func (t *TaskManager) GetTaskByID(taskID int32) (*structure.TaskInfo, error) { + task, exists := t.allTaskMap[taskID] + if !exists { + return nil, errors.New("task not found") + } + return task, nil +} + +func (t *TaskManager) GetLastTask(spaceName string) *structure.TaskInfo { + // Implement logic to get the last task in the queue for the given space + for _, task := range t.allTaskMap { + if task.SpaceName == spaceName { + return task + } + } + return nil +} + +func (t *TaskManager) GetAllTasks() []*structure.TaskInfo { + tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) + for _, task := range t.allTaskMap { + tasks = append(tasks, task) + } + return tasks +} + +func (t *TaskManager) GetTasksInQueue(space string) []*structure.TaskInfo { + tasks := make([]*structure.TaskInfo, 0) + for _, task := range t.allTaskMap { + if task.SpaceName == space { + tasks = append(tasks, task) + } + } + return tasks +} + +func (t *TaskManager) IsTaskOngoing(taskID int32) bool { + // Check if the task is currently ongoing + task, exists := t.allTaskMap[taskID] + if !exists { + return false + } + return task.State == structure.TaskStateCreated +} From 2aa8c2a68cf00e9befdabba5f38c30faec9f9c87 Mon Sep 17 00:00:00 2001 From: ethereal Date: Tue, 22 Jul 2025 01:53:49 +0800 Subject: [PATCH 05/27] feat: pass test --- vermeer/apps/master/bl/grpc_handlers.go | 2 + vermeer/apps/master/bl/scheduler_bl.go | 291 +++++++++++++----- .../master/schedules/algorithm_manager.go | 64 +++- vermeer/apps/master/schedules/broker.go | 19 +- .../apps/master/schedules/resource_manager.go | 115 ++++--- .../master/schedules/scheduler_manager.go | 160 ---------- vermeer/apps/master/schedules/task_manager.go | 35 ++- vermeer/apps/master/workers/worker_manager.go | 4 + 8 files changed, 396 insertions(+), 294 deletions(-) delete mode 100644 vermeer/apps/master/schedules/scheduler_manager.go diff --git a/vermeer/apps/master/bl/grpc_handlers.go b/vermeer/apps/master/bl/grpc_handlers.go index e1c235584..c5ae16987 100644 --- a/vermeer/apps/master/bl/grpc_handlers.go +++ b/vermeer/apps/master/bl/grpc_handlers.go @@ -26,6 +26,7 @@ import ( "time" "vermeer/apps/compute" "vermeer/apps/graphio" + "vermeer/apps/master/schedules" "vermeer/apps/master/threshold" "vermeer/apps/master/workers" pb "vermeer/apps/protos" @@ -99,6 +100,7 @@ func (h 
*ServerHandler) SayHelloMaster(ctx context.Context, req *pb.HelloMasterR } _, err = workerMgr.AddWorker(reqWorker) + Scheduler.ChangeWorkerStatus(reqWorker.Name, schedules.WorkerOngoingStatusIdle) if err != nil { logrus.Errorf("failed to add a WorkerClient to the WorkerManager, error: %s", err) return &pb.HelloMasterResp{}, err diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index 6a0200940..ffbf0229c 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -19,7 +19,9 @@ package bl import ( "errors" + "strconv" "time" + "vermeer/apps/common" "vermeer/apps/master/schedules" "vermeer/apps/structure" @@ -28,16 +30,111 @@ import ( type ScheduleBl struct { structure.MutexLocker - schedulerManager *schedules.SchedulerManager + // resource management + resourceManager *schedules.ResourceManager + // algorithm management + algorithmManager *schedules.AlgorithmManager + // task management + taskManager *schedules.TaskManager + // start channel for tasks to be started + startChan chan *structure.TaskInfo } func (s *ScheduleBl) Init() { - s.schedulerManager = &schedules.SchedulerManager{} - s.schedulerManager.Init(taskMgr.SetState, taskMgr.SetError) + logrus.Info("Initializing ScheduleBl...") + const defaultChanSizeConfig = "10" + chanSize := common.GetConfigDefault("start_chan_size", defaultChanSizeConfig).(string) + // Convert string to int + chanSizeInt, err := strconv.Atoi(chanSize) + if err != nil { + logrus.Errorf("failed to convert start_chan_size to int: %v", err) + logrus.Infof("using default start_chan_size: %s", defaultChanSizeConfig) + chanSizeInt, _ = strconv.Atoi(defaultChanSizeConfig) + } + startChan := make(chan *structure.TaskInfo, chanSizeInt) + s.startChan = startChan + + s.resourceManager = &schedules.ResourceManager{} + s.resourceManager.Init() + s.taskManager = &schedules.TaskManager{} + s.taskManager.Init() + s.algorithmManager = &schedules.AlgorithmManager{} + s.algorithmManager.Init() + go s.startTicker() + go s.waitingStartedTask() } -func (s *ScheduleBl) PeekSpaceTail(space string) *structure.TaskInfo { - return s.schedulerManager.GetLastTask(space) +func (s *ScheduleBl) startTicker() { + // Create a ticker that triggers every 3 seconds + // TODO: make it configurable + ticker := time.Tick(3 * time.Second) + + for range ticker { + logrus.Debug("Ticker ticked") + s.TryScheduleNextTasks() + } +} + +// this make scheduler manager try to schedule next tasks +func (s *ScheduleBl) TryScheduleNextTasks() { + defer func() { + if err := recover(); err != nil { + logrus.Errorln("TryScheduleNextTasks() has been recovered:", err) + } + }() + + // TODO: make it configurable + if err := s.tryScheduleInner(true); err != nil { + logrus.Errorf("do scheduling error:%v", err) + } +} + +// Main routine to schedule tasks +func (s *ScheduleBl) tryScheduleInner(softSchedule bool) error { + // Implement logic to get the next task in the queue for the given space + + // step 1: make sure all tasks have alloc to a worker group + // This is done by the TaskManager, which assigns a worker group to each task + + // step 2: get available resources and tasks + logrus.Debugf("scheduling next tasks, softSchedule: %v", softSchedule) + availableWorkers := s.resourceManager.GetIdleWorkers() + allTasks := s.taskManager.GetAllTasksNotRunning() + if len(allTasks) == 0 || len(availableWorkers) == 0 { + logrus.Debugf("no available tasks or workers, availableTasks: %d, availableWorkers: %d", + len(allTasks), 
len(availableWorkers)) + return nil + } + logrus.Debugf("available tasks: %d, available workers: %d", len(allTasks), len(availableWorkers)) + + // step 3: return the task with the highest priority or small tasks which can be executed immediately + workerGroupMap := s.taskManager.GetWorkerGroupMap() + nextTasks, err := s.algorithmManager.ScheduleNextTasks(allTasks, workerGroupMap, availableWorkers, softSchedule) + if err != nil { + logrus.Errorf("failed to schedule next tasks: %v", err) + return err + } + logrus.Debugf("scheduled %d tasks", len(nextTasks)) + // step 4: send to start channel + for _, task := range nextTasks { + if task == nil { + logrus.Warnf("received a nil task from algorithm manager") + continue + } + if task.State != structure.TaskStateWaiting { + logrus.Warnf("task '%d' is not in waiting state, current state: %s", task.ID, task.State) + continue + } + logrus.Infof("scheduling task '%d' with type '%s' to start channel", task.ID, task.Type) + select { + case s.startChan <- task: + logrus.Infof("task '%d' sent to start channel", task.ID) + default: + logrus.Warnf("start channel is full, task '%d' could not be sent", task.ID) + } + } + + return nil } // QueueTask Add the task to the inner queue. @@ -58,7 +155,8 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { } // Notice: Ensure successful invocation. - ok, err := s.schedulerManager.QueueTask(taskInfo) + // make sure all tasks have alloc to a worker group + ok, err := s.taskManager.QueueTask(taskInfo) if err != nil { taskMgr.SetError(taskInfo, err.Error()) return ok, err @@ -67,82 +165,43 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { return ok, nil } -func (s *ScheduleBl) CancelTask(taskInfo *structure.TaskInfo) error { - if taskInfo == nil { - return errors.New("the argument `taskInfo` is nil") - } - - isHeadTask := s.schedulerManager.IsTaskOngoing(taskInfo.ID) - task := s.schedulerManager.RemoveTask(taskInfo.ID) - // err := s.schedulerManager.CancelTask(taskInfo) - isInQueue := false - if task != nil { - logrus.Infof("removed task '%d' from space queue", taskInfo.ID) - isInQueue = true - } +// ******** CloseCurrent ******** - if isInQueue && !isHeadTask { - if err := taskMgr.SetState(taskInfo, structure.TaskStateCanceled); err != nil { - return err - } - - logrus.Infof("set task '%d' to TaskStateCanceled", taskInfo.ID) - } else { - logrus.Infof("sending task '%d' to task canceler", taskInfo.ID) - return s.handleCancelTask(taskInfo) - } +func (s *ScheduleBl) CloseCurrent(taskId int32) error { + // trace tasks need these workers, check if these tasks are available + s.taskManager.RemoveTask(taskId) + // release the worker group + s.resourceManager.ReleaseByTaskID(taskId) + logrus.Infof("invoke dispatch when task '%d' is closed", taskId) + s.TryScheduleNextTasks() return nil } -func (s *ScheduleBl) IsDispatchPaused() bool { - return s.schedulerManager.IsDispatchPaused() -} -func (s *ScheduleBl) PauseDispatch() { - s.schedulerManager.PauseDispatch() -} - -func (s *ScheduleBl) ResumeDispatch() { - s.schedulerManager.ResumeDispatch() -} - -func (s *ScheduleBl) AllTasksInQueue() []*structure.TaskInfo { - return s.schedulerManager.AllTasksInQueue() -} +func (s *ScheduleBl) ChangeWorkerStatus(workerName string, status schedules.WorkerOngoingStatus) { + s.resourceManager.ChangeWorkerStatus(workerName, status) -func (s *ScheduleBl) TasksInQueue(space string) []*structure.TaskInfo { - return s.schedulerManager.TasksInQueue(space) -} - -func (s *ScheduleBl) 
CloseCurrent(taskId int32) error { - s.schedulerManager.ReleaseByTaskID(taskId) - - logrus.Infof("invoke dispatch when task '%d' is closed", taskId) - s.schedulerManager.TryScheduleNextTasks() - return nil + logrus.Infof("worker '%s' status changed to '%s'", workerName, status) + // After changing the worker status, we may need to reschedule tasks + s.TryScheduleNextTasks() } -func (s *ScheduleBl) handleCancelTask(taskInfo *structure.TaskInfo) error { - logrus.Infof("received task '%d' to cancel", taskInfo.ID) - canceler, err := NewTaskCanceler(taskInfo) - if err != nil { - logrus.Errorf("failed to create new TaskCanceler err: %v", err) - taskMgr.SetError(taskInfo, err.Error()) - return err - } +// ******** START TASK ******** +func (s *ScheduleBl) waitingStartedTask() { + for taskInfo := range s.startChan { + if taskInfo == nil { + logrus.Warnf("recieved a nil task from startChan") + return + } - if err := canceler.CancelTask(); err != nil { - logrus.Errorf("failed to cancel task '%d', caused by: %v", taskInfo.ID, err) - taskMgr.SetError(taskInfo, err.Error()) - return err + logrus.Infof("chan received task '%d' to start", taskInfo.ID) + s.handleStartTask(taskInfo) } - - return nil } // now, start task! func (s *ScheduleBl) handleStartTask(taskInfo *structure.TaskInfo) { - agent, status, err := s.schedulerManager.GetAgent(taskInfo) + agent, status, err := s.resourceManager.GetAgentAndAssignTask(taskInfo) if err != nil { logrus.Errorf("apply agent error: %v", err) @@ -150,14 +209,14 @@ func (s *ScheduleBl) handleStartTask(taskInfo *structure.TaskInfo) { return } - // switch status { - // case schedules.AgentStatusNoWorker: - // fallthrough - // case schedules.AgentStatusWorkerNotReady: - // logrus.Warnf("failed to apply an agent for task '%d', graph: %s/%s, status: %s", - // taskInfo.ID, taskInfo.SpaceName, taskInfo.GraphName, status) - // return - // } + switch status { + case schedules.AgentStatusNoWorker: + fallthrough + case schedules.AgentStatusWorkerNotReady: + logrus.Warnf("failed to apply an agent for task '%d', graph: %s/%s, status: %s", + taskInfo.ID, taskInfo.SpaceName, taskInfo.GraphName, status) + return + } if agent == nil { logrus.Infof("no available agent for task '%d', graph: %s/%s, status: %s", @@ -205,3 +264,83 @@ func (s *ScheduleBl) startWaitingTask(agent *schedules.Agent, taskInfo *structur taskMgr.SetError(taskInfo, err.Error()) } } + +// ********* CANCEL TASK ******** +// handle cancel task + +func (s *ScheduleBl) CancelTask(taskInfo *structure.TaskInfo) error { + if taskInfo == nil { + return errors.New("the argument `taskInfo` is nil") + } + + isHeadTask := s.taskManager.IsTaskOngoing(taskInfo.ID) + task := s.taskManager.RemoveTask(taskInfo.ID) + // err := s.taskManager.CancelTask(taskInfo) + isInQueue := false + if task != nil { + logrus.Infof("removed task '%d' from space queue", taskInfo.ID) + isInQueue = true + } + + if isInQueue && !isHeadTask { + if err := taskMgr.SetState(taskInfo, structure.TaskStateCanceled); err != nil { + return err + } + + logrus.Infof("set task '%d' to TaskStateCanceled", taskInfo.ID) + } else { + logrus.Infof("sending task '%d' to task canceler", taskInfo.ID) + return s.handleCancelTask(taskInfo) + } + + return nil +} + +func (s *ScheduleBl) handleCancelTask(taskInfo *structure.TaskInfo) error { + logrus.Infof("received task '%d' to cancel", taskInfo.ID) + canceler, err := NewTaskCanceler(taskInfo) + if err != nil { + logrus.Errorf("failed to create new TaskCanceler err: %v", err) + taskMgr.SetError(taskInfo, err.Error()) + return 
err + } + + if err := canceler.CancelTask(); err != nil { + logrus.Errorf("failed to cancel task '%d', caused by: %v", taskInfo.ID, err) + taskMgr.SetError(taskInfo, err.Error()) + return err + } + + return nil +} + +// ** Other Methods ** + +func (s *ScheduleBl) PeekSpaceTail(space string) *structure.TaskInfo { + return s.taskManager.GetLastTask(space) +} + +func (s *ScheduleBl) IsDispatchPaused() bool { + // Implement logic to check if dispatching is paused + return s.algorithmManager.IsDispatchPaused() +} + +func (s *ScheduleBl) PauseDispatch() { + // Implement logic to pause dispatching + s.algorithmManager.PauseDispatch() +} + +func (s *ScheduleBl) ResumeDispatch() { + // Implement logic to resume dispatching + s.algorithmManager.ResumeDispatch() +} + +func (s *ScheduleBl) AllTasksInQueue() []*structure.TaskInfo { + // Implement logic to get all tasks in the queue + return s.taskManager.GetAllTasks() +} + +func (s *ScheduleBl) TasksInQueue(space string) []*structure.TaskInfo { + // Implement logic to get tasks in the queue for a specific space + return s.taskManager.GetTasksInQueue(space) +} diff --git a/vermeer/apps/master/schedules/algorithm_manager.go b/vermeer/apps/master/schedules/algorithm_manager.go index de769cf1c..42fe28a88 100644 --- a/vermeer/apps/master/schedules/algorithm_manager.go +++ b/vermeer/apps/master/schedules/algorithm_manager.go @@ -1,17 +1,77 @@ package schedules +import "vermeer/apps/structure" + type Algorithm interface { // Name returns the name of the algorithm Name() string // Execute runs the algorithm with the provided parameters - Execute(params map[string]interface{}) (interface{}, error) - // Validate checks if the provided parameters are valid for the algorithm + ScheduleNextTasks(allTasks []*structure.TaskInfo, workerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) } type AlgorithmManager struct { supportedAlgorithms map[string]Algorithm + nowAlgorithm string + dispatchPaused bool } func (am *AlgorithmManager) Init() { am.supportedAlgorithms = make(map[string]Algorithm) + am.dispatchPaused = false + // Register default algorithms + am.RegisterAlgorithm(&FIFOAlgorithm{}) + am.nowAlgorithm = "FIFO" // Default algorithm +} + +func (am *AlgorithmManager) RegisterAlgorithm(algorithm Algorithm) { + if algorithm == nil { + return + } + name := algorithm.Name() + if _, exists := am.supportedAlgorithms[name]; exists { + return // Algorithm already registered + } + am.supportedAlgorithms[name] = algorithm +} + +func (am *AlgorithmManager) IsDispatchPaused() bool { + return am.dispatchPaused +} + +func (am *AlgorithmManager) PauseDispatch() { + am.dispatchPaused = true +} + +func (am *AlgorithmManager) ResumeDispatch() { + am.dispatchPaused = false +} + +func (am *AlgorithmManager) ScheduleNextTasks(allTasks []*structure.TaskInfo, workerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if am.dispatchPaused { + return nil, nil // No tasks to schedule if dispatch is paused + } + + tasks, err := am.supportedAlgorithms[am.nowAlgorithm].ScheduleNextTasks(allTasks, workerGroupMap, idleWorkers, softSchedule) + if err != nil { + return nil, err + } + + return tasks, nil +} + +type FIFOAlgorithm struct{} + +func (f *FIFOAlgorithm) Name() string { + return "FIFO" +} + +func (f *FIFOAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, workerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(allTasks) == 
0 { + return nil, nil // No tasks to schedule + } + + // For FIFO, we simply return the available tasks in the order they are provided + first_task := allTasks[0] + + return []*structure.TaskInfo{first_task}, nil } diff --git a/vermeer/apps/master/schedules/broker.go b/vermeer/apps/master/schedules/broker.go index fabdf415a..b44ca18b5 100644 --- a/vermeer/apps/master/schedules/broker.go +++ b/vermeer/apps/master/schedules/broker.go @@ -23,6 +23,7 @@ import ( "github.com/sirupsen/logrus" + "vermeer/apps/master/workers" . "vermeer/apps/master/workers" ) @@ -72,42 +73,42 @@ func (b *Broker) AllAgents() []*Agent { return res } -func (b *Broker) ApplyAgent(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { +func (b *Broker) ApplyAgent(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, map[string]*workers.WorkerClient, error) { if taskInfo == nil { - return nil, AgentStatusError, fmt.Errorf("taskInfo is nil") + return nil, AgentStatusError, nil, fmt.Errorf("taskInfo is nil") } defer b.Unlock(b.Lock()) agent, workers, err := b.getAgent(taskInfo) if err != nil { - return nil, AgentStatusError, err + return nil, AgentStatusError, nil, err } if agent == nil { - return nil, AgentStatusPending, nil + return nil, AgentStatusPending, nil, nil } if workers == nil || len(workers) == 0 { - return nil, AgentStatusNoWorker, nil + return nil, AgentStatusNoWorker, nil, nil } if !b.isWorkersReady(workers) { logrus.Warnf("the workers of agent '%s' are not ready", agent.GroupName()) - return nil, AgentStatusWorkerNotReady, nil + return nil, AgentStatusWorkerNotReady, nil, nil } if b.isAgentBusy(agent) { - return nil, AgentStatusAgentBusy, nil + return nil, AgentStatusAgentBusy, nil, nil } if b.isWorkerBusy(workers, agent) { - return nil, AgentStatusWorkerBusy, nil + return nil, AgentStatusWorkerBusy, nil, nil } agent.AssignTask(taskInfo) - return agent, AgentStatusOk, nil + return agent, AgentStatusOk, workers, nil } // func (b *Broker) isAgentReady(taskInfo *structure.TaskInfo, agent *Agent) bool { diff --git a/vermeer/apps/master/schedules/resource_manager.go b/vermeer/apps/master/schedules/resource_manager.go index 415045514..c01759e70 100644 --- a/vermeer/apps/master/schedules/resource_manager.go +++ b/vermeer/apps/master/schedules/resource_manager.go @@ -14,7 +14,10 @@ const ( ) type ResourceManager struct { - workerStatus map[string]WorkerOngoingStatus + structure.MutexLocker + workerStatus map[string]WorkerOngoingStatus + runningWorkerTasks map[string][]int32 // worker ID to list of running task IDs + availableWorkerGroups map[string]bool // worker group name to availability status // broker just responsible for communication with workers // it can not apply tasks to workers directly broker *Broker @@ -22,43 +25,52 @@ type ResourceManager struct { func (rm *ResourceManager) Init() { rm.workerStatus = make(map[string]WorkerOngoingStatus) -} - -func (rm *ResourceManager) Lock() { - // Implement locking logic if necessary -} - -func (rm *ResourceManager) Unlock() { - // Implement unlocking logic if necessary + rm.runningWorkerTasks = make(map[string][]int32) + rm.availableWorkerGroups = make(map[string]bool) + rm.broker = new(Broker).Init() } func (rm *ResourceManager) ReleaseByTaskID(taskID int32) { - rm.Lock() - defer rm.Unlock() + defer rm.Unlock(rm.Lock()) for worker, status := range rm.workerStatus { if status == WorkerOngoingStatusRunning && rm.isTaskRunningOnWorker(worker, taskID) { delete(rm.workerStatus, worker) - break + if tasks, exists := rm.runningWorkerTasks[worker]; exists { + for i, 
id := range tasks { + if id == taskID { + rm.runningWorkerTasks[worker] = append(tasks[:i], tasks[i+1:]...) + if len(rm.runningWorkerTasks[worker]) == 0 { + delete(rm.runningWorkerTasks, worker) + } + break + } + } + } + rm.changeWorkerStatus(worker, WorkerOngoingStatusIdle) } } } func (rm *ResourceManager) isTaskRunningOnWorker(worker string, taskID int32) bool { - // Implement logic to check if a task is running on a specific worker - // This is a placeholder implementation - return false // Replace with actual logic + if tasks, exists := rm.runningWorkerTasks[worker]; exists { + for _, id := range tasks { + if id == taskID { + return true + } + } + } + return false } -func (rm *ResourceManager) GetAgent(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { +func (rm *ResourceManager) GetAgentAndAssignTask(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { if taskInfo == nil { return nil, AgentStatusError, errors.New("taskInfo is nil") } - rm.Lock() - defer rm.Unlock() + defer rm.Unlock(rm.Lock()) - agent, status, err := rm.broker.ApplyAgent(taskInfo) + agent, status, workers, err := rm.broker.ApplyAgent(taskInfo) if err != nil { return nil, AgentStatusError, err } @@ -69,37 +81,60 @@ func (rm *ResourceManager) GetAgent(taskInfo *structure.TaskInfo) (*Agent, Agent // Assign the task to the agent agent.AssignTask(taskInfo) + for _, worker := range workers { + if worker == nil { + continue + } + rm.workerStatus[worker.Name] = WorkerOngoingStatusRunning + if _, exists := rm.runningWorkerTasks[worker.Name]; !exists { + rm.runningWorkerTasks[worker.Name] = []int32{} + } + rm.runningWorkerTasks[worker.Name] = append(rm.runningWorkerTasks[worker.Name], taskInfo.ID) + } + return agent, status, nil } -func (rm *ResourceManager) IsDispatchPaused() bool { - rm.Lock() - defer rm.Unlock() +func (rm *ResourceManager) GetIdleWorkers() []string { + defer rm.Unlock(rm.Lock()) - for _, status := range rm.workerStatus { - if status == WorkerOngoingStatusPaused { - return true + idleWorkers := make([]string, 0) + for worker, status := range rm.workerStatus { + if status == WorkerOngoingStatusIdle { + idleWorkers = append(idleWorkers, worker) } } - return false + return idleWorkers } -func (rm *ResourceManager) PauseDispatch() { - rm.Lock() - defer rm.Unlock() - - for worker := range rm.workerStatus { - rm.workerStatus[worker] = WorkerOngoingStatusPaused +func (rm *ResourceManager) changeWorkerStatus(workerName string, status WorkerOngoingStatus) { + rm.workerStatus[workerName] = status + + if status == WorkerOngoingStatusIdle { + workerInfo := workerMgr.GetWorkerInfo(workerName) + + // get worker group name + groupName := workerInfo.Group + if groupName != "" { + // check all workers in this group are idle + allIdle := true + for _, w := range workerMgr.GetGroupWorkers(groupName) { + if rm.workerStatus[w.Name] != WorkerOngoingStatusIdle { + allIdle = false + break + } + } + if allIdle { + rm.availableWorkerGroups[groupName] = true + } else { + rm.availableWorkerGroups[groupName] = false + } + } } } -func (rm *ResourceManager) ResumeDispatch() { - rm.Lock() - defer rm.Unlock() +func (rm *ResourceManager) ChangeWorkerStatus(workerName string, status WorkerOngoingStatus) { + defer rm.Unlock(rm.Lock()) - for worker := range rm.workerStatus { - if rm.workerStatus[worker] == WorkerOngoingStatusPaused { - rm.workerStatus[worker] = WorkerOngoingStatusIdle - } - } + rm.changeWorkerStatus(workerName, status) } diff --git a/vermeer/apps/master/schedules/scheduler_manager.go 
b/vermeer/apps/master/schedules/scheduler_manager.go deleted file mode 100644 index 9fcd68243..000000000 --- a/vermeer/apps/master/schedules/scheduler_manager.go +++ /dev/null @@ -1,160 +0,0 @@ -package schedules - -import ( - "strconv" - "time" - "vermeer/apps/common" - "vermeer/apps/structure" - - "github.com/sirupsen/logrus" -) - -type SchedulerManager struct { - // resource management - resourceManager *ResourceManager - // algorithm management - algorithmManager *AlgorithmManager - // task management - taskManager *TaskManager - // start channel for tasks to be started - startChan chan *structure.TaskInfo - // register callbacks - StartTaskCallback func(taskInfo *structure.TaskInfo) error -} - -func (s *SchedulerManager) Init(SetTaskStatusCallback func(taskInfo *structure.TaskInfo, status structure.TaskState) error, - SetTaskErrorCallback func(taskInfo *structure.TaskInfo, errMsg string) bool) *SchedulerManager { - const defaultChanSizeConfig = "10" - chanSize := common.GetConfigDefault("start_chan_size", defaultChanSizeConfig).(string) - // Convert string to int - chanSizeInt, err := strconv.Atoi(chanSize) - if err != nil { - logrus.Errorf("failed to convert start_chan_size to int: %v", err) - logrus.Infof("using default start_chan_size: %s", defaultChanSizeConfig) - chanSizeInt, _ = strconv.Atoi(defaultChanSizeConfig) - } - startChan := make(chan *structure.TaskInfo, chanSizeInt) - s.startChan = startChan - - s.resourceManager = &ResourceManager{} - s.resourceManager.Init() - s.taskManager = &TaskManager{} - s.taskManager.Init() - s.algorithmManager = &AlgorithmManager{} - s.algorithmManager.Init() - go s.startTicker() - return s -} - -func (s *SchedulerManager) startTicker() { - // Create a ticker that triggers every 3 seconds - // TODO: make it configurable - ticker := time.Tick(3 * time.Second) - - for range ticker { - //logrus.Debug("Ticker ticked") - s.TryScheduleNextTasks() - } -} - -func (s *SchedulerManager) waitingStartedTask() { - for taskInfo := range s.startChan { - if taskInfo == nil { - logrus.Warnf("recieved a nil task from startChan") - return - } - - logrus.Infof("chan received task '%d' to start", taskInfo.ID) - s.StartTaskCallback(taskInfo) - } -} - -// this make scheduler manager try to schedule next tasks -func (s *SchedulerManager) TryScheduleNextTasks() { - defer func() { - if err := recover(); err != nil { - logrus.Errorln("TryScheduleNextTasks() has been recovered:", err) - } - }() - - // TODO: make it configurable - if err := s.tryScheduleInner(true); err != nil { - logrus.Errorf("do scheduling error:%v", err) - } -} - -// Main routine to schedule tasks -func (s *SchedulerManager) tryScheduleInner(softSchedule bool) error { - // Implement logic to get the next task in the queue for the given space - - // step 1: make sure all tasks have alloc to a worker group - - // step 2: sort available tasks by priority (by calling algorithm's GetNextTask method) - - // step 3: return the task with the highest priority or small tasks which can be executed immediately - - // step 4: send to start channel - - return nil -} - -func (s *SchedulerManager) ReleaseByTaskID(taskID int32) { - // trace tasks need these workers, check if these tasks are available - s.taskManager.RemoveTask(taskID) - // release the worker group - s.resourceManager.ReleaseByTaskID(taskID) -} - -func (s *SchedulerManager) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { - // make sure all tasks have alloc to a worker group - s.taskManager.QueueTask(taskInfo) - - return true, nil -} - -func (s 
*SchedulerManager) GetLastTask(spaceName string) *structure.TaskInfo { - // Implement logic to get the last task in the queue for the given space - return s.taskManager.GetLastTask(spaceName) -} - -func (s *SchedulerManager) IsDispatchPaused() bool { - // Implement logic to check if dispatching is paused - return s.resourceManager.IsDispatchPaused() -} - -func (s *SchedulerManager) PauseDispatch() { - // Implement logic to pause dispatching - s.resourceManager.PauseDispatch() -} - -func (s *SchedulerManager) ResumeDispatch() { - // Implement logic to resume dispatching - s.resourceManager.ResumeDispatch() -} - -func (s *SchedulerManager) AllTasksInQueue() []*structure.TaskInfo { - // Implement logic to get all tasks in the queue - return s.taskManager.GetAllTasks() -} - -func (s *SchedulerManager) TasksInQueue(space string) []*structure.TaskInfo { - // Implement logic to get tasks in the queue for a specific space - return s.taskManager.GetTasksInQueue(space) -} - -// - -func (s *SchedulerManager) IsTaskOngoing(taskID int32) bool { - // Check if the task is ongoing - return s.taskManager.IsTaskOngoing(taskID) -} - -func (s *SchedulerManager) RemoveTask(taskID int32) error { - // Remove a task from the queue - return s.taskManager.RemoveTask(taskID) -} - -func (s *SchedulerManager) GetAgent(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { - // Get an agent for the given task - return s.resourceManager.GetAgent(taskInfo) -} diff --git a/vermeer/apps/master/schedules/task_manager.go b/vermeer/apps/master/schedules/task_manager.go index 5c71de2d8..4c3c39181 100644 --- a/vermeer/apps/master/schedules/task_manager.go +++ b/vermeer/apps/master/schedules/task_manager.go @@ -8,15 +8,14 @@ import ( type TaskManager struct { // This struct is responsible for managing tasks in the scheduling system. // A map from task ID to TaskInfo can be used to track tasks. - allTaskMap map[int32]*structure.TaskInfo - availableTaskMap map[int32]*structure.TaskInfo + allTaskMap map[int32]*structure.TaskInfo + allTaskQueue []*structure.TaskInfo // A map from task ID to worker group can be used to track which worker group is handling which task. 
workerGroupMap map[int32]string } func (t *TaskManager) Init() *TaskManager { t.allTaskMap = make(map[int32]*structure.TaskInfo) - t.availableTaskMap = make(map[int32]*structure.TaskInfo) t.workerGroupMap = make(map[int32]string) return t } @@ -65,9 +64,12 @@ func (t *TaskManager) GetTaskByID(taskID int32) (*structure.TaskInfo, error) { func (t *TaskManager) GetLastTask(spaceName string) *structure.TaskInfo { // Implement logic to get the last task in the queue for the given space - for _, task := range t.allTaskMap { - if task.SpaceName == spaceName { - return task + if len(t.allTaskQueue) == 0 { + return nil + } + for i := len(t.allTaskQueue) - 1; i >= 0; i-- { + if t.allTaskQueue[i].SpaceName == spaceName { + return t.allTaskQueue[i] } } return nil @@ -81,9 +83,19 @@ func (t *TaskManager) GetAllTasks() []*structure.TaskInfo { return tasks } +func (t *TaskManager) GetAllTasksNotRunning() []*structure.TaskInfo { + tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) + for _, task := range t.allTaskMap { + if task.State == structure.TaskStateWaiting { + tasks = append(tasks, task) + } + } + return tasks +} + func (t *TaskManager) GetTasksInQueue(space string) []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0) - for _, task := range t.allTaskMap { + for _, task := range t.allTaskQueue { if task.SpaceName == space { tasks = append(tasks, task) } @@ -91,6 +103,15 @@ func (t *TaskManager) GetTasksInQueue(space string) []*structure.TaskInfo { return tasks } +func (t *TaskManager) GetWorkerGroupMap() map[int32]string { + // Return a copy of the worker group map to avoid external modifications + groupMap := make(map[int32]string, len(t.workerGroupMap)) + for k, v := range t.workerGroupMap { + groupMap[k] = v + } + return groupMap +} + func (t *TaskManager) IsTaskOngoing(taskID int32) bool { // Check if the task is currently ongoing task, exists := t.allTaskMap[taskID] diff --git a/vermeer/apps/master/workers/worker_manager.go b/vermeer/apps/master/workers/worker_manager.go index bfaec0b8d..f2ea2fb62 100644 --- a/vermeer/apps/master/workers/worker_manager.go +++ b/vermeer/apps/master/workers/worker_manager.go @@ -577,6 +577,10 @@ func (wm *workerManager) getGroupWorkers(workerGroup string) []*WorkerClient { return workers } +func (wm *workerManager) GetGroupWorkers(workerGroup string) []*WorkerClient { + return wm.getGroupWorkers(workerGroup) +} + func (wm *workerManager) getGroupWorkerMap(workerGroup string) map[string]*WorkerClient { workerMap := make(map[string]*WorkerClient) From 095b6dabea61501648d6c69b1c90bc0b1b3ea4a4 Mon Sep 17 00:00:00 2001 From: ethereal Date: Tue, 22 Jul 2025 15:20:26 +0800 Subject: [PATCH 06/27] feat: a tmp version of optimizing --- vermeer/apps/master/bl/scheduler_bl.go | 18 +++++++++------- .../master/schedules/algorithm_manager.go | 21 ++++++++++++------- vermeer/apps/master/schedules/task_manager.go | 16 +++++++------- vermeer/vermeer_test.go | 5 +++-- 4 files changed, 35 insertions(+), 25 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index ffbf0229c..6a27c4dd1 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -92,24 +92,25 @@ func (s *ScheduleBl) TryScheduleNextTasks() { // Main routine to schedule tasks func (s *ScheduleBl) tryScheduleInner(softSchedule bool) error { // Implement logic to get the next task in the queue for the given space + defer s.Unlock(s.Lock()) // step 1: make sure all tasks have alloc to a worker group // This is 
done by the TaskManager, which assigns a worker group to each task // step 2: get available resources and tasks logrus.Debugf("scheduling next tasks, softSchedule: %v", softSchedule) - availableWorkers := s.resourceManager.GetIdleWorkers() - allTasks := s.taskManager.GetAllTasksNotRunning() - if len(allTasks) == 0 || len(availableWorkers) == 0 { - logrus.Debugf("no available tasks or workers, availableTasks: %d, availableWorkers: %d", - len(allTasks), len(availableWorkers)) + idleWorkers := s.resourceManager.GetIdleWorkers() + waitingTasks := s.taskManager.GetAllTasksWaitng() + if len(waitingTasks) == 0 || len(idleWorkers) == 0 { + logrus.Debugf("no available tasks or workers, waitingTasks: %d, idleWorkers: %d", + len(waitingTasks), len(idleWorkers)) return nil } - logrus.Debugf("available tasks: %d, available workers: %d", len(allTasks), len(availableWorkers)) + logrus.Debugf("waiting tasks: %d, idle workers: %d", len(waitingTasks), len(idleWorkers)) // step 3: return the task with the highest priority or small tasks which can be executed immediately - workerGroupMap := s.taskManager.GetWorkerGroupMap() - nextTasks, err := s.algorithmManager.ScheduleNextTasks(allTasks, workerGroupMap, availableWorkers, softSchedule) + taskToWorkerGroupMap := s.taskManager.GetTaskToWorkerGroupMap() + nextTasks, err := s.algorithmManager.ScheduleNextTasks(waitingTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) if err != nil { logrus.Errorf("failed to schedule next tasks: %v", err) return err @@ -239,6 +240,7 @@ func (s *ScheduleBl) startWaitingTask(agent *schedules.Agent, taskInfo *structur } }() + // TODO: Is here need a lock? TOCTTOU if taskInfo.State != structure.TaskStateWaiting { logrus.Errorf("task state is not in 'Waiting' state, taskID: %v", taskInfo) return diff --git a/vermeer/apps/master/schedules/algorithm_manager.go b/vermeer/apps/master/schedules/algorithm_manager.go index 42fe28a88..3c7bfd88e 100644 --- a/vermeer/apps/master/schedules/algorithm_manager.go +++ b/vermeer/apps/master/schedules/algorithm_manager.go @@ -6,7 +6,7 @@ type Algorithm interface { // Name returns the name of the algorithm Name() string // Execute runs the algorithm with the provided parameters - ScheduleNextTasks(allTasks []*structure.TaskInfo, workerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) + ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) } type AlgorithmManager struct { @@ -46,12 +46,12 @@ func (am *AlgorithmManager) ResumeDispatch() { am.dispatchPaused = false } -func (am *AlgorithmManager) ScheduleNextTasks(allTasks []*structure.TaskInfo, workerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (am *AlgorithmManager) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { if am.dispatchPaused { return nil, nil // No tasks to schedule if dispatch is paused } - tasks, err := am.supportedAlgorithms[am.nowAlgorithm].ScheduleNextTasks(allTasks, workerGroupMap, idleWorkers, softSchedule) + tasks, err := am.supportedAlgorithms[am.nowAlgorithm].ScheduleNextTasks(waitingTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) if err != nil { return nil, err } @@ -65,13 +65,20 @@ func (f *FIFOAlgorithm) Name() string { return "FIFO" } -func (f *FIFOAlgorithm) 
ScheduleNextTasks(allTasks []*structure.TaskInfo, workerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - if len(allTasks) == 0 { +func (f *FIFOAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(waitingTasks) == 0 { return nil, nil // No tasks to schedule } // For FIFO, we simply return the available tasks in the order they are provided - first_task := allTasks[0] + for _, task := range waitingTasks { + if task.State != structure.TaskStateWaiting { + continue // Only consider tasks that are in the waiting state + } + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } - return []*structure.TaskInfo{first_task}, nil + return nil, nil } diff --git a/vermeer/apps/master/schedules/task_manager.go b/vermeer/apps/master/schedules/task_manager.go index 4c3c39181..dd3291d27 100644 --- a/vermeer/apps/master/schedules/task_manager.go +++ b/vermeer/apps/master/schedules/task_manager.go @@ -11,12 +11,12 @@ type TaskManager struct { allTaskMap map[int32]*structure.TaskInfo allTaskQueue []*structure.TaskInfo // A map from task ID to worker group can be used to track which worker group is handling which task. - workerGroupMap map[int32]string + taskToworkerGroupMap map[int32]string } func (t *TaskManager) Init() *TaskManager { t.allTaskMap = make(map[int32]*structure.TaskInfo) - t.workerGroupMap = make(map[int32]string) + t.taskToworkerGroupMap = make(map[int32]string) return t } @@ -40,7 +40,7 @@ func (t *TaskManager) RemoveTask(taskID int32) error { return errors.New("task not found") } delete(t.allTaskMap, taskID) - delete(t.workerGroupMap, taskID) + delete(t.taskToworkerGroupMap, taskID) return nil } @@ -50,7 +50,7 @@ func (t *TaskManager) AssignGroup(taskInfo *structure.TaskInfo) error { if group == "" { return errors.New("failed to assign group for task") } - t.workerGroupMap[taskInfo.ID] = group + t.taskToworkerGroupMap[taskInfo.ID] = group return nil } @@ -83,7 +83,7 @@ func (t *TaskManager) GetAllTasks() []*structure.TaskInfo { return tasks } -func (t *TaskManager) GetAllTasksNotRunning() []*structure.TaskInfo { +func (t *TaskManager) GetAllTasksWaitng() []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) for _, task := range t.allTaskMap { if task.State == structure.TaskStateWaiting { @@ -103,10 +103,10 @@ func (t *TaskManager) GetTasksInQueue(space string) []*structure.TaskInfo { return tasks } -func (t *TaskManager) GetWorkerGroupMap() map[int32]string { +func (t *TaskManager) GetTaskToWorkerGroupMap() map[int32]string { // Return a copy of the worker group map to avoid external modifications - groupMap := make(map[int32]string, len(t.workerGroupMap)) - for k, v := range t.workerGroupMap { + groupMap := make(map[int32]string, len(t.taskToworkerGroupMap)) + for k, v := range t.taskToworkerGroupMap { groupMap[k] = v } return groupMap diff --git a/vermeer/vermeer_test.go b/vermeer/vermeer_test.go index 4dde004d1..e3e4158a7 100644 --- a/vermeer/vermeer_test.go +++ b/vermeer/vermeer_test.go @@ -104,8 +104,9 @@ func testFunction(t *testing.T) { func testAlgorithms(t *testing.T) { // todo: 增加算法名称 - var computeTasks = []string{"pagerank", "lpa", "wcc", "degree_out", "degree_in", "degree_both", "triangle_count", - "sssp", "closeness_centrality", 
"betweenness_centrality", "kcore", "jaccard", "ppr", "clustering_coefficient", "scc", "louvain"} + // var computeTasks = []string{"pagerank", "lpa", "wcc", "degree_out", "degree_in", "degree_both", "triangle_count", + // "sssp", "closeness_centrality", "betweenness_centrality", "kcore", "jaccard", "ppr", "clustering_coefficient", "scc", "louvain"} + var computeTasks = []string{"pagerank"} startTime := time.Now() expectRes, err := functional.GetExpectRes(expectResPath) From fe3fcdcac5d76fc51c7936b95d36be91ffa506b0 Mon Sep 17 00:00:00 2001 From: ethereal Date: Tue, 22 Jul 2025 19:11:32 +0800 Subject: [PATCH 07/27] feat: a tmp version of optimizing --- vermeer/apps/master/bl/scheduler_bl.go | 11 ++++++++++- vermeer/apps/master/bl/worker_bl.go | 2 +- vermeer/apps/master/schedules/resource_manager.go | 8 ++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index 6a27c4dd1..a2f0697dc 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -168,12 +168,21 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { // ******** CloseCurrent ******** -func (s *ScheduleBl) CloseCurrent(taskId int32) error { +func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) error { // trace tasks need these workers, check if these tasks are available s.taskManager.RemoveTask(taskId) // release the worker group s.resourceManager.ReleaseByTaskID(taskId) + if len(removeWorkerName) > 0 { + workerName := removeWorkerName[0] + if workerName == "" { + return errors.New("the argument `removeWorkerName` is empty") + } + logrus.Infof("removing worker '%s' from resource manager", workerName) + s.ChangeWorkerStatus(workerName, schedules.WorkerOngoingStatusDeleted) + } + logrus.Infof("invoke dispatch when task '%d' is closed", taskId) s.TryScheduleNextTasks() return nil diff --git a/vermeer/apps/master/bl/worker_bl.go b/vermeer/apps/master/bl/worker_bl.go index e14d20c45..651e1e043 100644 --- a/vermeer/apps/master/bl/worker_bl.go +++ b/vermeer/apps/master/bl/worker_bl.go @@ -70,7 +70,7 @@ func (wb *WorkerBl) ReleaseWorker(workerName string) error { //taskInfo.SetErrMsg(fmt.Sprintf("worker %v is offline", workerName)) taskMgr.SetError(taskInfo, fmt.Sprintf("worker %v is offline", workerName)) logrus.Warnf("set task %v status:error", taskInfo.ID) - if err := Scheduler.CloseCurrent(taskInfo.ID); err != nil { + if err := Scheduler.CloseCurrent(taskInfo.ID, workerName); err != nil { logrus.Errorf("failed to close task with ID: %d,err:%v", taskInfo.ID, err) } break diff --git a/vermeer/apps/master/schedules/resource_manager.go b/vermeer/apps/master/schedules/resource_manager.go index c01759e70..8aaaf4e7a 100644 --- a/vermeer/apps/master/schedules/resource_manager.go +++ b/vermeer/apps/master/schedules/resource_manager.go @@ -11,6 +11,7 @@ const ( WorkerOngoingStatusIdle WorkerOngoingStatus = "idle" WorkerOngoingStatusRunning WorkerOngoingStatus = "running" WorkerOngoingStatusPaused WorkerOngoingStatus = "paused" + WorkerOngoingStatusDeleted WorkerOngoingStatus = "deleted" ) type ResourceManager struct { @@ -130,9 +131,16 @@ func (rm *ResourceManager) changeWorkerStatus(workerName string, status WorkerOn rm.availableWorkerGroups[groupName] = false } } + } else if status == WorkerOngoingStatusDeleted { + delete(rm.workerStatus, workerName) + delete(rm.runningWorkerTasks, workerName) + delete(rm.availableWorkerGroups, workerName) } + + // TODO: Other 
status changes can be handled here if needed } +// TODO: when sync task created, need to alloc worker? func (rm *ResourceManager) ChangeWorkerStatus(workerName string, status WorkerOngoingStatus) { defer rm.Unlock(rm.Lock()) From 3a72f0a84b54650f10ec0b53d6a3b9020f9f695d Mon Sep 17 00:00:00 2001 From: ethereal Date: Fri, 25 Jul 2025 00:55:13 +0800 Subject: [PATCH 08/27] chore: add some algorithms --- vermeer/apps/master/bl/task_bl.go | 23 ++++++ .../master/schedules/algorithm_manager.go | 77 ++++++++++++++++++- vermeer/apps/structure/task.go | 4 + 3 files changed, 102 insertions(+), 2 deletions(-) diff --git a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go index fa3d5b589..7f48e2cd1 100644 --- a/vermeer/apps/master/bl/task_bl.go +++ b/vermeer/apps/master/bl/task_bl.go @@ -21,6 +21,8 @@ import ( "errors" "fmt" "sort" + "strconv" + "strings" "time" "vermeer/apps/compute" @@ -62,6 +64,27 @@ func (tb *TaskBl) CreateTaskInfo( return nil, err } + // for scheduler + if params != nil { + if priority, ok := params["priority"]; ok { + if p, err := strconv.Atoi(priority); err == nil { + taskInfo.Priority = int32(p) + } else { + logrus.Warnf("priority convert to int32 error:%v", err) + } + } + if preorders, ok := params["preorders"]; ok { + preorderList := strings.Split(preorders, ",") + for _, preorder := range preorderList { + if pid, err := strconv.Atoi(preorder); err == nil { + taskInfo.Preorders = append(taskInfo.Preorders, int32(pid)) + } else { + logrus.Warnf("preorder convert to int32 error:%v", err) + } + } + } + } + return taskInfo, nil } diff --git a/vermeer/apps/master/schedules/algorithm_manager.go b/vermeer/apps/master/schedules/algorithm_manager.go index 3c7bfd88e..c9fe59860 100644 --- a/vermeer/apps/master/schedules/algorithm_manager.go +++ b/vermeer/apps/master/schedules/algorithm_manager.go @@ -1,6 +1,9 @@ package schedules -import "vermeer/apps/structure" +import ( + "sort" + "vermeer/apps/structure" +) type Algorithm interface { // Name returns the name of the algorithm @@ -20,7 +23,9 @@ func (am *AlgorithmManager) Init() { am.dispatchPaused = false // Register default algorithms am.RegisterAlgorithm(&FIFOAlgorithm{}) - am.nowAlgorithm = "FIFO" // Default algorithm + am.RegisterAlgorithm(&PriorityAlgorithm{}) + am.RegisterAlgorithm(&DependsAlgorithm{}) + am.nowAlgorithm = "Priority" // Default algorithm } func (am *AlgorithmManager) RegisterAlgorithm(algorithm Algorithm) { @@ -82,3 +87,71 @@ func (f *FIFOAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, ta return nil, nil } + +type PriorityAlgorithm struct{} + +func (p *PriorityAlgorithm) Name() string { + return "Priority" +} + +func (p *PriorityAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(waitingTasks) == 0 { + return nil, nil // No tasks to schedule + } + + // Sort tasks by priority (higher priority first) + sort.Slice(waitingTasks, func(i, j int) bool { + return waitingTasks[i].Priority > waitingTasks[j].Priority + }) + + for _, task := range waitingTasks { + if task.State != structure.TaskStateWaiting { + continue // Only consider tasks that are in the waiting state + } + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } + + return nil, nil +} + +type DependsAlgorithm struct{} + +func (d *DependsAlgorithm) Name() string { + return 
"Depends" +} + +func (d *DependsAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(waitingTasks) == 0 { + return nil, nil // No tasks to schedule + } + + sort.Slice(waitingTasks, func(i, j int) bool { + return waitingTasks[i].ID < waitingTasks[j].ID + }) + + waitingTaskIDs := make(map[int32]*structure.TaskInfo) + for _, task := range waitingTasks { + waitingTaskIDs[task.ID] = task + } + + for _, task := range waitingTasks { + depends := task.Preorders + // Check if all dependencies are satisfied + allDepsSatisfied := true + for _, dep := range depends { + if depTask, exists := waitingTaskIDs[dep]; !exists || depTask.State != structure.TaskStateWaiting { + allDepsSatisfied = false + break + } + } + if allDepsSatisfied { + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } + } + + return nil, nil +} diff --git a/vermeer/apps/structure/task.go b/vermeer/apps/structure/task.go index 87356f2bf..2c20cdb29 100644 --- a/vermeer/apps/structure/task.go +++ b/vermeer/apps/structure/task.go @@ -55,6 +55,10 @@ type TaskInfo struct { wg *sync.WaitGroup Action int32 StatisticsResult map[string]any + + // for scheduler + Priority int32 + Preorders []int32 } func (ti *TaskInfo) SetState(state TaskState) { From 0acc01490e0fd0b722f955a1cca4c619842404d2 Mon Sep 17 00:00:00 2001 From: ethereal Date: Sat, 26 Jul 2025 23:50:39 +0800 Subject: [PATCH 09/27] chore: rename the modules; add filter interface --- vermeer/apps/master/bl/scheduler_bl.go | 12 +- .../master/schedules/algorithm_manager.go | 157 ------------ .../schedules/scheduler_algorithm_manager.go | 236 ++++++++++++++++++ ...nager.go => scheduler_resource_manager.go} | 16 +- ...k_manager.go => scheduler_task_manager.go} | 24 +- 5 files changed, 262 insertions(+), 183 deletions(-) delete mode 100644 vermeer/apps/master/schedules/algorithm_manager.go create mode 100644 vermeer/apps/master/schedules/scheduler_algorithm_manager.go rename vermeer/apps/master/schedules/{resource_manager.go => scheduler_resource_manager.go} (84%) rename vermeer/apps/master/schedules/{task_manager.go => scheduler_task_manager.go} (74%) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index a2f0697dc..d531f9934 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -31,11 +31,11 @@ import ( type ScheduleBl struct { structure.MutexLocker // resource management - resourceManager *schedules.ResourceManager + resourceManager *schedules.SchedulerResourceManager // algorithm management - algorithmManager *schedules.AlgorithmManager + algorithmManager *schedules.SchedulerAlgorithmManager // task management - taskManager *schedules.TaskManager + taskManager *schedules.SchedulerTaskManager // start channel for tasks to be started startChan chan *structure.TaskInfo } @@ -54,11 +54,11 @@ func (s *ScheduleBl) Init() { startChan := make(chan *structure.TaskInfo, chanSizeInt) s.startChan = startChan - s.resourceManager = &schedules.ResourceManager{} + s.resourceManager = &schedules.SchedulerResourceManager{} s.resourceManager.Init() - s.taskManager = &schedules.TaskManager{} + s.taskManager = &schedules.SchedulerTaskManager{} s.taskManager.Init() - s.algorithmManager = &schedules.AlgorithmManager{} + s.algorithmManager = 
&schedules.SchedulerAlgorithmManager{} s.algorithmManager.Init() go s.startTicker() go s.waitingStartedTask() diff --git a/vermeer/apps/master/schedules/algorithm_manager.go b/vermeer/apps/master/schedules/algorithm_manager.go deleted file mode 100644 index c9fe59860..000000000 --- a/vermeer/apps/master/schedules/algorithm_manager.go +++ /dev/null @@ -1,157 +0,0 @@ -package schedules - -import ( - "sort" - "vermeer/apps/structure" -) - -type Algorithm interface { - // Name returns the name of the algorithm - Name() string - // Execute runs the algorithm with the provided parameters - ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) -} - -type AlgorithmManager struct { - supportedAlgorithms map[string]Algorithm - nowAlgorithm string - dispatchPaused bool -} - -func (am *AlgorithmManager) Init() { - am.supportedAlgorithms = make(map[string]Algorithm) - am.dispatchPaused = false - // Register default algorithms - am.RegisterAlgorithm(&FIFOAlgorithm{}) - am.RegisterAlgorithm(&PriorityAlgorithm{}) - am.RegisterAlgorithm(&DependsAlgorithm{}) - am.nowAlgorithm = "Priority" // Default algorithm -} - -func (am *AlgorithmManager) RegisterAlgorithm(algorithm Algorithm) { - if algorithm == nil { - return - } - name := algorithm.Name() - if _, exists := am.supportedAlgorithms[name]; exists { - return // Algorithm already registered - } - am.supportedAlgorithms[name] = algorithm -} - -func (am *AlgorithmManager) IsDispatchPaused() bool { - return am.dispatchPaused -} - -func (am *AlgorithmManager) PauseDispatch() { - am.dispatchPaused = true -} - -func (am *AlgorithmManager) ResumeDispatch() { - am.dispatchPaused = false -} - -func (am *AlgorithmManager) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - if am.dispatchPaused { - return nil, nil // No tasks to schedule if dispatch is paused - } - - tasks, err := am.supportedAlgorithms[am.nowAlgorithm].ScheduleNextTasks(waitingTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) - if err != nil { - return nil, err - } - - return tasks, nil -} - -type FIFOAlgorithm struct{} - -func (f *FIFOAlgorithm) Name() string { - return "FIFO" -} - -func (f *FIFOAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - if len(waitingTasks) == 0 { - return nil, nil // No tasks to schedule - } - - // For FIFO, we simply return the available tasks in the order they are provided - for _, task := range waitingTasks { - if task.State != structure.TaskStateWaiting { - continue // Only consider tasks that are in the waiting state - } - if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { - return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled - } - } - - return nil, nil -} - -type PriorityAlgorithm struct{} - -func (p *PriorityAlgorithm) Name() string { - return "Priority" -} - -func (p *PriorityAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - if len(waitingTasks) == 0 { - return nil, nil // No tasks to schedule - } - - // Sort tasks by priority (higher priority first) - sort.Slice(waitingTasks, func(i, j int) bool { - return 
waitingTasks[i].Priority > waitingTasks[j].Priority - }) - - for _, task := range waitingTasks { - if task.State != structure.TaskStateWaiting { - continue // Only consider tasks that are in the waiting state - } - if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { - return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled - } - } - - return nil, nil -} - -type DependsAlgorithm struct{} - -func (d *DependsAlgorithm) Name() string { - return "Depends" -} - -func (d *DependsAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - if len(waitingTasks) == 0 { - return nil, nil // No tasks to schedule - } - - sort.Slice(waitingTasks, func(i, j int) bool { - return waitingTasks[i].ID < waitingTasks[j].ID - }) - - waitingTaskIDs := make(map[int32]*structure.TaskInfo) - for _, task := range waitingTasks { - waitingTaskIDs[task.ID] = task - } - - for _, task := range waitingTasks { - depends := task.Preorders - // Check if all dependencies are satisfied - allDepsSatisfied := true - for _, dep := range depends { - if depTask, exists := waitingTaskIDs[dep]; !exists || depTask.State != structure.TaskStateWaiting { - allDepsSatisfied = false - break - } - } - if allDepsSatisfied { - if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { - return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled - } - } - } - - return nil, nil -} diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go new file mode 100644 index 000000000..18b4df620 --- /dev/null +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -0,0 +1,236 @@ +package schedules + +import ( + "sort" + "vermeer/apps/structure" +) + +type SchedulerAlgorithm interface { + // Name returns the name of the SchedulerAlgorithm + Name() string + // FilterNextTasks filters the next tasks to be scheduled based on the provided parameters + FilterNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) + // ScheduleNextTasks schedules the next tasks based on the filtered tasks + ScheduleNextTasks(filteredTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) +} + +type SchedulerAlgorithmManager struct { + filteredSchedulerAlgorithms map[string]SchedulerAlgorithm + schuduledSchedulerAlgorithms map[string]SchedulerAlgorithm + dispatchPaused bool +} + +func (am *SchedulerAlgorithmManager) Init() { + am.filteredSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) + am.schuduledSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) + am.dispatchPaused = false + // Register filter and schedule algorithms + am.RegisterFilterAlgorithm(&DependsSchedulerAlgorithm{}) + // Register default SchedulerAlgorithms + am.RegisterSchedulerAlgorithm(&PrioritySchedulerAlgorithm{}) +} + +func (am *SchedulerAlgorithmManager) RegisterSchedulerAlgorithm(SchedulerAlgorithm SchedulerAlgorithm) { + if SchedulerAlgorithm == nil { + return + } + name := SchedulerAlgorithm.Name() + if _, exists := am.schuduledSchedulerAlgorithms[name]; exists { + return // SchedulerAlgorithm already registered + } + + // only support one scheduling algorithm for now + if 
len(am.schuduledSchedulerAlgorithms) > 0 { + return // Only one scheduling algorithm can be registered + } + am.schuduledSchedulerAlgorithms[name] = SchedulerAlgorithm +} + +func (am *SchedulerAlgorithmManager) RegisterFilterAlgorithm(SchedulerAlgorithm SchedulerAlgorithm) { + if SchedulerAlgorithm == nil { + return + } + name := SchedulerAlgorithm.Name() + if _, exists := am.filteredSchedulerAlgorithms[name]; exists { + return // SchedulerAlgorithm already registered + } + am.filteredSchedulerAlgorithms[name] = SchedulerAlgorithm +} + +func (am *SchedulerAlgorithmManager) IsDispatchPaused() bool { + return am.dispatchPaused +} + +func (am *SchedulerAlgorithmManager) PauseDispatch() { + am.dispatchPaused = true +} + +func (am *SchedulerAlgorithmManager) ResumeDispatch() { + am.dispatchPaused = false +} + +func (am *SchedulerAlgorithmManager) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if am.dispatchPaused { + return nil, nil // No tasks to schedule if dispatch is paused + } + + filteredTasks := waitingTasks + for _, algorithm := range am.filteredSchedulerAlgorithms { + var err error + filteredTasks, err = algorithm.FilterNextTasks(filteredTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) + if err != nil { + return nil, err + } + } + if len(filteredTasks) == 0 { + return nil, nil // No tasks to schedule after filtering + } + + // only support one scheduling algorithm for now + // get first algorithm + for _, algorithm := range am.schuduledSchedulerAlgorithms { + tasks, err := algorithm.ScheduleNextTasks(filteredTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) + if err != nil { + return nil, err + } + return tasks, nil // Return the scheduled tasks + } + + return nil, nil // No tasks scheduled +} + +type FIFOSchedulerAlgorithm struct{} + +func (f *FIFOSchedulerAlgorithm) Name() string { + return "FIFO" +} + +func (f *FIFOSchedulerAlgorithm) FilterNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + // just return the waiting tasks as is for FIFO + return waitingTasks, nil +} + +func (f *FIFOSchedulerAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(waitingTasks) == 0 { + return nil, nil // No tasks to schedule + } + + // For FIFO, we simply return the available tasks in the order they are provided + for _, task := range waitingTasks { + if task.State != structure.TaskStateWaiting { + continue // Only consider tasks that are in the waiting state + } + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } + + return nil, nil +} + +type PrioritySchedulerAlgorithm struct{} + +func (p *PrioritySchedulerAlgorithm) Name() string { + return "Priority" +} + +func (p *PrioritySchedulerAlgorithm) FilterNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + // just return the waiting tasks as is for Priority + return waitingTasks, nil +} + +func (p *PrioritySchedulerAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) 
([]*structure.TaskInfo, error) { + if len(waitingTasks) == 0 { + return nil, nil // No tasks to schedule + } + + // Sort tasks by priority (higher priority first) + sort.Slice(waitingTasks, func(i, j int) bool { + return waitingTasks[i].Priority > waitingTasks[j].Priority + }) + + for _, task := range waitingTasks { + if task.State != structure.TaskStateWaiting { + continue // Only consider tasks that are in the waiting state + } + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } + + return nil, nil +} + +type DependsSchedulerAlgorithm struct{} + +func (d *DependsSchedulerAlgorithm) Name() string { + return "Depends" +} + +func (d *DependsSchedulerAlgorithm) FilterNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(waitingTasks) == 0 { + return nil, nil // No tasks to schedule + } + + sort.Slice(waitingTasks, func(i, j int) bool { + return waitingTasks[i].ID < waitingTasks[j].ID + }) + + waitingTaskIDs := make(map[int32]*structure.TaskInfo) + for _, task := range waitingTasks { + waitingTaskIDs[task.ID] = task + } + + filteredTasks := make([]*structure.TaskInfo, 0) + for _, task := range waitingTasks { + depends := task.Preorders + // Check if all dependencies are satisfied + allDepsSatisfied := true + for _, dep := range depends { + if depTask, exists := waitingTaskIDs[dep]; !exists || depTask.State != structure.TaskStateWaiting { + allDepsSatisfied = false + break + } + } + if allDepsSatisfied { + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + filteredTasks = append(filteredTasks, task) // Add to filtered tasks if dependencies are satisfied + } + } + } + return filteredTasks, nil +} + +func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(waitingTasks) == 0 { + return nil, nil // No tasks to schedule + } + + sort.Slice(waitingTasks, func(i, j int) bool { + return waitingTasks[i].ID < waitingTasks[j].ID + }) + + waitingTaskIDs := make(map[int32]*structure.TaskInfo) + for _, task := range waitingTasks { + waitingTaskIDs[task.ID] = task + } + + for _, task := range waitingTasks { + depends := task.Preorders + // Check if all dependencies are satisfied + allDepsSatisfied := true + for _, dep := range depends { + if depTask, exists := waitingTaskIDs[dep]; !exists || depTask.State != structure.TaskStateWaiting { + allDepsSatisfied = false + break + } + } + if allDepsSatisfied { + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } + } + + return nil, nil +} diff --git a/vermeer/apps/master/schedules/resource_manager.go b/vermeer/apps/master/schedules/scheduler_resource_manager.go similarity index 84% rename from vermeer/apps/master/schedules/resource_manager.go rename to vermeer/apps/master/schedules/scheduler_resource_manager.go index 8aaaf4e7a..c8adc9412 100644 --- a/vermeer/apps/master/schedules/resource_manager.go +++ b/vermeer/apps/master/schedules/scheduler_resource_manager.go @@ -14,7 +14,7 @@ const ( WorkerOngoingStatusDeleted WorkerOngoingStatus = "deleted" ) -type ResourceManager struct { +type SchedulerResourceManager struct { 
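The algorithm manager first narrows the candidate list with every registered filter and only then hands the survivors to the single registered scheduling algorithm. A stripped-down sketch of that two-phase control flow is shown below; the `task`, `filterFn`, and `pickFn` types are hypothetical simplifications of the real `SchedulerAlgorithm` interface:

```
package main

import "fmt"

// task, filterFn and pickFn are illustrative; the real code works on
// *structure.TaskInfo through the SchedulerAlgorithm interface.
type task struct {
	id       int32
	waiting  bool
	priority int32
}

type filterFn func([]*task) []*task
type pickFn func([]*task) *task

// schedule chains every filter, then lets a single picker choose the next task.
func schedule(tasks []*task, filters []filterFn, pick pickFn) *task {
	for _, f := range filters {
		tasks = f(tasks)
		if len(tasks) == 0 {
			return nil // filtered down to nothing
		}
	}
	return pick(tasks)
}

func main() {
	onlyWaiting := func(in []*task) []*task {
		var out []*task
		for _, t := range in {
			if t.waiting {
				out = append(out, t)
			}
		}
		return out
	}
	highestPriority := func(in []*task) *task {
		best := in[0]
		for _, t := range in[1:] {
			if t.priority > best.priority {
				best = t
			}
		}
		return best
	}
	tasks := []*task{{id: 1, waiting: true}, {id: 2, priority: 9}, {id: 3, waiting: true, priority: 5}}
	fmt.Println(schedule(tasks, []filterFn{onlyWaiting}, highestPriority).id) // prints 3
}
```

Registering filters and pickers separately keeps cross-cutting constraints such as waiting state or dependency readiness out of the ordering policy, so a new ordering strategy only has to implement the pick step.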
structure.MutexLocker workerStatus map[string]WorkerOngoingStatus runningWorkerTasks map[string][]int32 // worker ID to list of running task IDs @@ -24,14 +24,14 @@ type ResourceManager struct { broker *Broker } -func (rm *ResourceManager) Init() { +func (rm *SchedulerResourceManager) Init() { rm.workerStatus = make(map[string]WorkerOngoingStatus) rm.runningWorkerTasks = make(map[string][]int32) rm.availableWorkerGroups = make(map[string]bool) rm.broker = new(Broker).Init() } -func (rm *ResourceManager) ReleaseByTaskID(taskID int32) { +func (rm *SchedulerResourceManager) ReleaseByTaskID(taskID int32) { defer rm.Unlock(rm.Lock()) for worker, status := range rm.workerStatus { @@ -53,7 +53,7 @@ func (rm *ResourceManager) ReleaseByTaskID(taskID int32) { } } -func (rm *ResourceManager) isTaskRunningOnWorker(worker string, taskID int32) bool { +func (rm *SchedulerResourceManager) isTaskRunningOnWorker(worker string, taskID int32) bool { if tasks, exists := rm.runningWorkerTasks[worker]; exists { for _, id := range tasks { if id == taskID { @@ -64,7 +64,7 @@ func (rm *ResourceManager) isTaskRunningOnWorker(worker string, taskID int32) bo return false } -func (rm *ResourceManager) GetAgentAndAssignTask(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { +func (rm *SchedulerResourceManager) GetAgentAndAssignTask(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { if taskInfo == nil { return nil, AgentStatusError, errors.New("taskInfo is nil") } @@ -96,7 +96,7 @@ func (rm *ResourceManager) GetAgentAndAssignTask(taskInfo *structure.TaskInfo) ( return agent, status, nil } -func (rm *ResourceManager) GetIdleWorkers() []string { +func (rm *SchedulerResourceManager) GetIdleWorkers() []string { defer rm.Unlock(rm.Lock()) idleWorkers := make([]string, 0) @@ -108,7 +108,7 @@ func (rm *ResourceManager) GetIdleWorkers() []string { return idleWorkers } -func (rm *ResourceManager) changeWorkerStatus(workerName string, status WorkerOngoingStatus) { +func (rm *SchedulerResourceManager) changeWorkerStatus(workerName string, status WorkerOngoingStatus) { rm.workerStatus[workerName] = status if status == WorkerOngoingStatusIdle { @@ -141,7 +141,7 @@ func (rm *ResourceManager) changeWorkerStatus(workerName string, status WorkerOn } // TODO: when sync task created, need to alloc worker? -func (rm *ResourceManager) ChangeWorkerStatus(workerName string, status WorkerOngoingStatus) { +func (rm *SchedulerResourceManager) ChangeWorkerStatus(workerName string, status WorkerOngoingStatus) { defer rm.Unlock(rm.Lock()) rm.changeWorkerStatus(workerName, status) diff --git a/vermeer/apps/master/schedules/task_manager.go b/vermeer/apps/master/schedules/scheduler_task_manager.go similarity index 74% rename from vermeer/apps/master/schedules/task_manager.go rename to vermeer/apps/master/schedules/scheduler_task_manager.go index dd3291d27..0683faaf8 100644 --- a/vermeer/apps/master/schedules/task_manager.go +++ b/vermeer/apps/master/schedules/scheduler_task_manager.go @@ -5,7 +5,7 @@ import ( "vermeer/apps/structure" ) -type TaskManager struct { +type SchedulerTaskManager struct { // This struct is responsible for managing tasks in the scheduling system. // A map from task ID to TaskInfo can be used to track tasks. 
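ReleaseByTaskID above walks the per-worker task lists and returns a worker to the idle pool once nothing is left running on it. A toy version of that bookkeeping is sketched here; the `pool` type and status names are illustrative, not the real `SchedulerResourceManager` fields:

```
package main

import "fmt"

type workerStatus string

const (
	statusIdle    workerStatus = "idle"
	statusRunning workerStatus = "running"
)

// pool is a toy stand-in for the resource manager's per-worker bookkeeping.
type pool struct {
	status map[string]workerStatus
	tasks  map[string][]int32 // worker name -> running task IDs
}

// release drops taskID from every worker and idles workers left without work.
func (p *pool) release(taskID int32) {
	for w, ids := range p.tasks {
		kept := ids[:0]
		for _, id := range ids {
			if id != taskID {
				kept = append(kept, id)
			}
		}
		p.tasks[w] = kept
		if len(kept) == 0 {
			p.status[w] = statusIdle
		}
	}
}

func main() {
	p := &pool{
		status: map[string]workerStatus{"w1": statusRunning},
		tasks:  map[string][]int32{"w1": {7}},
	}
	p.release(7)
	fmt.Println(p.status["w1"]) // prints idle
}
```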
allTaskMap map[int32]*structure.TaskInfo @@ -14,13 +14,13 @@ type TaskManager struct { taskToworkerGroupMap map[int32]string } -func (t *TaskManager) Init() *TaskManager { +func (t *SchedulerTaskManager) Init() *SchedulerTaskManager { t.allTaskMap = make(map[int32]*structure.TaskInfo) t.taskToworkerGroupMap = make(map[int32]string) return t } -func (t *TaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { +func (t *SchedulerTaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { if taskInfo == nil { return false, errors.New("the argument `taskInfo` is nil") } @@ -35,7 +35,7 @@ func (t *TaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { return true, nil } -func (t *TaskManager) RemoveTask(taskID int32) error { +func (t *SchedulerTaskManager) RemoveTask(taskID int32) error { if _, exists := t.allTaskMap[taskID]; !exists { return errors.New("task not found") } @@ -45,7 +45,7 @@ func (t *TaskManager) RemoveTask(taskID int32) error { } // update or create a task in the task map -func (t *TaskManager) AssignGroup(taskInfo *structure.TaskInfo) error { +func (t *SchedulerTaskManager) AssignGroup(taskInfo *structure.TaskInfo) error { group := workerMgr.ApplyGroup(taskInfo.SpaceName, taskInfo.GraphName) if group == "" { return errors.New("failed to assign group for task") @@ -54,7 +54,7 @@ func (t *TaskManager) AssignGroup(taskInfo *structure.TaskInfo) error { return nil } -func (t *TaskManager) GetTaskByID(taskID int32) (*structure.TaskInfo, error) { +func (t *SchedulerTaskManager) GetTaskByID(taskID int32) (*structure.TaskInfo, error) { task, exists := t.allTaskMap[taskID] if !exists { return nil, errors.New("task not found") @@ -62,7 +62,7 @@ func (t *TaskManager) GetTaskByID(taskID int32) (*structure.TaskInfo, error) { return task, nil } -func (t *TaskManager) GetLastTask(spaceName string) *structure.TaskInfo { +func (t *SchedulerTaskManager) GetLastTask(spaceName string) *structure.TaskInfo { // Implement logic to get the last task in the queue for the given space if len(t.allTaskQueue) == 0 { return nil @@ -75,7 +75,7 @@ func (t *TaskManager) GetLastTask(spaceName string) *structure.TaskInfo { return nil } -func (t *TaskManager) GetAllTasks() []*structure.TaskInfo { +func (t *SchedulerTaskManager) GetAllTasks() []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) for _, task := range t.allTaskMap { tasks = append(tasks, task) @@ -83,7 +83,7 @@ func (t *TaskManager) GetAllTasks() []*structure.TaskInfo { return tasks } -func (t *TaskManager) GetAllTasksWaitng() []*structure.TaskInfo { +func (t *SchedulerTaskManager) GetAllTasksWaitng() []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) for _, task := range t.allTaskMap { if task.State == structure.TaskStateWaiting { @@ -93,7 +93,7 @@ func (t *TaskManager) GetAllTasksWaitng() []*structure.TaskInfo { return tasks } -func (t *TaskManager) GetTasksInQueue(space string) []*structure.TaskInfo { +func (t *SchedulerTaskManager) GetTasksInQueue(space string) []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0) for _, task := range t.allTaskQueue { if task.SpaceName == space { @@ -103,7 +103,7 @@ func (t *TaskManager) GetTasksInQueue(space string) []*structure.TaskInfo { return tasks } -func (t *TaskManager) GetTaskToWorkerGroupMap() map[int32]string { +func (t *SchedulerTaskManager) GetTaskToWorkerGroupMap() map[int32]string { // Return a copy of the worker group map to avoid external modifications groupMap := make(map[int32]string, 
len(t.taskToworkerGroupMap)) for k, v := range t.taskToworkerGroupMap { @@ -112,7 +112,7 @@ func (t *TaskManager) GetTaskToWorkerGroupMap() map[int32]string { return groupMap } -func (t *TaskManager) IsTaskOngoing(taskID int32) bool { +func (t *SchedulerTaskManager) IsTaskOngoing(taskID int32) bool { // Check if the task is currently ongoing task, exists := t.allTaskMap[taskID] if !exists { From 185583cdfae88592f4e485aa1eaea1f884033b45 Mon Sep 17 00:00:00 2001 From: ethereal Date: Wed, 30 Jul 2025 00:58:00 +0800 Subject: [PATCH 10/27] chore: add PriorityElderSchedulerAlgorithm --- vermeer/apps/master/bl/scheduler_bl.go | 12 +- .../schedules/scheduler_algorithm_manager.go | 138 ++++++++++++++---- 2 files changed, 113 insertions(+), 37 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index d531f9934..b949b6ed4 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -100,17 +100,17 @@ func (s *ScheduleBl) tryScheduleInner(softSchedule bool) error { // step 2: get available resources and tasks logrus.Debugf("scheduling next tasks, softSchedule: %v", softSchedule) idleWorkers := s.resourceManager.GetIdleWorkers() - waitingTasks := s.taskManager.GetAllTasksWaitng() - if len(waitingTasks) == 0 || len(idleWorkers) == 0 { - logrus.Debugf("no available tasks or workers, waitingTasks: %d, idleWorkers: %d", - len(waitingTasks), len(idleWorkers)) + allTasks := s.taskManager.GetAllTasks() + if len(allTasks) == 0 || len(idleWorkers) == 0 { + logrus.Debugf("no available tasks or workers, allTasks: %d, idleWorkers: %d", + len(allTasks), len(idleWorkers)) return nil } - logrus.Debugf("waiting tasks: %d, idle workers: %d", len(waitingTasks), len(idleWorkers)) + logrus.Debugf("all tasks: %d, idle workers: %d", len(allTasks), len(idleWorkers)) // step 3: return the task with the highest priority or small tasks which can be executed immediately taskToWorkerGroupMap := s.taskManager.GetTaskToWorkerGroupMap() - nextTasks, err := s.algorithmManager.ScheduleNextTasks(waitingTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) + nextTasks, err := s.algorithmManager.ScheduleNextTasks(allTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) if err != nil { logrus.Errorf("failed to schedule next tasks: %v", err) return err diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index 18b4df620..02414db46 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -2,6 +2,7 @@ package schedules import ( "sort" + "time" "vermeer/apps/structure" ) @@ -9,7 +10,7 @@ type SchedulerAlgorithm interface { // Name returns the name of the SchedulerAlgorithm Name() string // FilterNextTasks filters the next tasks to be scheduled based on the provided parameters - FilterNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) + FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) // ScheduleNextTasks schedules the next tasks based on the filtered tasks ScheduleNextTasks(filteredTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) } @@ -25,9 +26,10 @@ func (am 
*SchedulerAlgorithmManager) Init() { am.schuduledSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) am.dispatchPaused = false // Register filter and schedule algorithms + am.RegisterFilterAlgorithm(&WaitingSchedulerAlgorithm{}) am.RegisterFilterAlgorithm(&DependsSchedulerAlgorithm{}) // Register default SchedulerAlgorithms - am.RegisterSchedulerAlgorithm(&PrioritySchedulerAlgorithm{}) + am.RegisterSchedulerAlgorithm(&PriorityElderSchedulerAlgorithm{}) } func (am *SchedulerAlgorithmManager) RegisterSchedulerAlgorithm(SchedulerAlgorithm SchedulerAlgorithm) { @@ -69,12 +71,12 @@ func (am *SchedulerAlgorithmManager) ResumeDispatch() { am.dispatchPaused = false } -func (am *SchedulerAlgorithmManager) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (am *SchedulerAlgorithmManager) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { if am.dispatchPaused { return nil, nil // No tasks to schedule if dispatch is paused } - filteredTasks := waitingTasks + filteredTasks := allTasks for _, algorithm := range am.filteredSchedulerAlgorithms { var err error filteredTasks, err = algorithm.FilterNextTasks(filteredTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) @@ -105,18 +107,18 @@ func (f *FIFOSchedulerAlgorithm) Name() string { return "FIFO" } -func (f *FIFOSchedulerAlgorithm) FilterNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (f *FIFOSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { // just return the waiting tasks as is for FIFO - return waitingTasks, nil + return allTasks, nil } -func (f *FIFOSchedulerAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - if len(waitingTasks) == 0 { +func (f *FIFOSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(allTasks) == 0 { return nil, nil // No tasks to schedule } // For FIFO, we simply return the available tasks in the order they are provided - for _, task := range waitingTasks { + for _, task := range allTasks { if task.State != structure.TaskStateWaiting { continue // Only consider tasks that are in the waiting state } @@ -134,22 +136,22 @@ func (p *PrioritySchedulerAlgorithm) Name() string { return "Priority" } -func (p *PrioritySchedulerAlgorithm) FilterNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (p *PrioritySchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { // just return the waiting tasks as is for Priority - return waitingTasks, nil + return allTasks, nil } -func (p *PrioritySchedulerAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) 
{ - if len(waitingTasks) == 0 { +func (p *PrioritySchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(allTasks) == 0 { return nil, nil // No tasks to schedule } // Sort tasks by priority (higher priority first) - sort.Slice(waitingTasks, func(i, j int) bool { - return waitingTasks[i].Priority > waitingTasks[j].Priority + sort.Slice(allTasks, func(i, j int) bool { + return allTasks[i].Priority > allTasks[j].Priority }) - for _, task := range waitingTasks { + for _, task := range allTasks { if task.State != structure.TaskStateWaiting { continue // Only consider tasks that are in the waiting state } @@ -161,28 +163,102 @@ func (p *PrioritySchedulerAlgorithm) ScheduleNextTasks(waitingTasks []*structure return nil, nil } +type PriorityElderSchedulerAlgorithm struct{} + +func (p *PriorityElderSchedulerAlgorithm) Name() string { + return "PriorityElder" +} + +func (p *PriorityElderSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + // just return the waiting tasks as is for PriorityElder + return allTasks, nil +} + +func (p *PriorityElderSchedulerAlgorithm) CalculateTaskEmergency(task *structure.TaskInfo, taskToWorkerGroupMap map[int32]string) int64 { + // step 1: age + ageCost := time.Since(task.CreateTime).Milliseconds() / 1000 // in seconds + // step 2: priority + priorityCost := int64(task.Priority) + // step 3: resource cost + gm := structure.GraphManager + resourceCost := 1 / gm.GetGraphByName(task.SpaceName, task.GraphName).VertexCount + // step 4: some random value + randomValue := int64(1) // Placeholder for any random value logic + return ageCost + priorityCost + resourceCost + randomValue +} + +func (p *PriorityElderSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(allTasks) == 0 { + return nil, nil // No tasks to schedule + } + + // Sort tasks by priority (higher priority first) + sort.Slice(allTasks, func(i, j int) bool { + return p.CalculateTaskEmergency(allTasks[i], taskToWorkerGroupMap) > p.CalculateTaskEmergency(allTasks[j], taskToWorkerGroupMap) + }) + + for _, task := range allTasks { + if task.State != structure.TaskStateWaiting { + continue // Only consider tasks that are in the waiting state + } + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } + + return nil, nil +} + +type WaitingSchedulerAlgorithm struct{} + +func (w *WaitingSchedulerAlgorithm) Name() string { + return "Waiting" +} + +func (w *WaitingSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + waitingTasks := make([]*structure.TaskInfo, 0) + for _, task := range allTasks { + if task.State == structure.TaskStateWaiting { + waitingTasks = append(waitingTasks, task) + } + } + return waitingTasks, nil +} + +func (w *WaitingSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + waitingTasks, err := w.FilterNextTasks(allTasks, 
taskToWorkerGroupMap, idleWorkers, softSchedule) + if err != nil { + return nil, err + } + if len(waitingTasks) == 0 { + return nil, nil + } + // For waiting tasks, we simply return them as is + return waitingTasks, nil +} + type DependsSchedulerAlgorithm struct{} func (d *DependsSchedulerAlgorithm) Name() string { return "Depends" } -func (d *DependsSchedulerAlgorithm) FilterNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - if len(waitingTasks) == 0 { +func (d *DependsSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(allTasks) == 0 { return nil, nil // No tasks to schedule } - sort.Slice(waitingTasks, func(i, j int) bool { - return waitingTasks[i].ID < waitingTasks[j].ID + sort.Slice(allTasks, func(i, j int) bool { + return allTasks[i].ID < allTasks[j].ID }) waitingTaskIDs := make(map[int32]*structure.TaskInfo) - for _, task := range waitingTasks { + for _, task := range allTasks { waitingTaskIDs[task.ID] = task } filteredTasks := make([]*structure.TaskInfo, 0) - for _, task := range waitingTasks { + for _, task := range allTasks { depends := task.Preorders // Check if all dependencies are satisfied allDepsSatisfied := true @@ -201,26 +277,26 @@ func (d *DependsSchedulerAlgorithm) FilterNextTasks(waitingTasks []*structure.Ta return filteredTasks, nil } -func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(waitingTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - if len(waitingTasks) == 0 { +func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { + if len(allTasks) == 0 { return nil, nil // No tasks to schedule } - sort.Slice(waitingTasks, func(i, j int) bool { - return waitingTasks[i].ID < waitingTasks[j].ID + sort.Slice(allTasks, func(i, j int) bool { + return allTasks[i].ID < allTasks[j].ID }) - waitingTaskIDs := make(map[int32]*structure.TaskInfo) - for _, task := range waitingTasks { - waitingTaskIDs[task.ID] = task + allTaskIDs := make(map[int32]*structure.TaskInfo) + for _, task := range allTasks { + allTaskIDs[task.ID] = task } - for _, task := range waitingTasks { + for _, task := range allTasks { depends := task.Preorders // Check if all dependencies are satisfied allDepsSatisfied := true for _, dep := range depends { - if depTask, exists := waitingTaskIDs[dep]; !exists || depTask.State != structure.TaskStateWaiting { + if depTask, exists := allTaskIDs[dep]; !exists || depTask.State != structure.TaskStateWaiting { allDepsSatisfied = false break } From 6f4b42ebd88cc5ac905c136cc466a59728dd4f86 Mon Sep 17 00:00:00 2001 From: ethereal Date: Thu, 31 Jul 2025 00:37:38 +0800 Subject: [PATCH 11/27] chore: check PriorityElderSchedulerAlgorithm --- vermeer/apps/master/bl/task_canceler.go | 3 +++ .../schedules/scheduler_algorithm_manager.go | 20 ++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/vermeer/apps/master/bl/task_canceler.go b/vermeer/apps/master/bl/task_canceler.go index 7932c7805..b86cea597 100644 --- a/vermeer/apps/master/bl/task_canceler.go +++ b/vermeer/apps/master/bl/task_canceler.go @@ -22,6 +22,7 @@ import ( "fmt" "sync" "time" + "vermeer/apps/master/schedules" 
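CalculateTaskEmergency, introduced above, folds task age, explicit priority, and an inverse graph-size bonus into a single urgency number so that old, high-priority jobs on small graphs are picked first. A self-contained sketch of that scoring follows; the weights are placeholders for illustration, not the values the scheduler actually uses:

```
package main

import (
	"fmt"
	"time"
)

// emergency mirrors the shape of CalculateTaskEmergency: older, higher-priority
// tasks on smaller graphs float to the front. Weights here are illustrative only.
func emergency(created time.Time, priority int32, vertices, edges int64) int64 {
	const (
		ageWeight      = 1
		priorityWeight = 1
		resourceWeight = int64(1e10)
	)
	ageCost := ageWeight * int64(time.Since(created).Seconds())
	priorityCost := priorityWeight * int64(priority)
	size := vertices + edges
	if size < 1 {
		size = 1 // avoid division by zero
	}
	resourceCost := resourceWeight / size
	return ageCost + priorityCost + resourceCost + 1
}

func main() {
	small := emergency(time.Now().Add(-10*time.Second), 0, 1_000, 5_000)
	large := emergency(time.Now().Add(-10*time.Second), 0, 1_000_000, 10_000_000)
	fmt.Println(small > large) // true: smaller graphs get a larger resource bonus
}
```

With an integer resource term like this, the bonus for tiny graphs dominates until larger jobs age long enough to catch up, which matches the stated intent of letting small tasks that can run immediately jump the queue.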
pb "vermeer/apps/protos" "vermeer/apps/structure" @@ -158,6 +159,8 @@ func (bc *baseCanceler) doCancelTask() (isContinue bool, err error) { canceled = false return } + // set worker state to idle + Scheduler.ChangeWorkerStatus(workerName, schedules.WorkerOngoingStatusIdle) }(workerName) } wg.Wait() diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index 02414db46..717d9c831 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -4,6 +4,8 @@ import ( "sort" "time" "vermeer/apps/structure" + + "github.com/sirupsen/logrus" ) type SchedulerAlgorithm interface { @@ -175,15 +177,21 @@ func (p *PriorityElderSchedulerAlgorithm) FilterNextTasks(allTasks []*structure. } func (p *PriorityElderSchedulerAlgorithm) CalculateTaskEmergency(task *structure.TaskInfo, taskToWorkerGroupMap map[int32]string) int64 { + // step 0: get params + ageParam := int64(1) + priorityParam := int64(1) + resourceParam := int64(1e10) + randomValueParam := int64(1) // step 1: age - ageCost := time.Since(task.CreateTime).Milliseconds() / 1000 // in seconds + ageCost := ageParam * time.Since(task.CreateTime).Milliseconds() / 1000 // in seconds // step 2: priority - priorityCost := int64(task.Priority) + priorityCost := priorityParam * int64(task.Priority) // step 3: resource cost - gm := structure.GraphManager - resourceCost := 1 / gm.GetGraphByName(task.SpaceName, task.GraphName).VertexCount + graph := structure.GraphManager.GetGraphByName(task.SpaceName, task.GraphName) + resourceCost := resourceParam / max(1, graph.VertexCount+graph.EdgeCount) // Avoid division by zero, ensure at least 1 // step 4: some random value - randomValue := int64(1) // Placeholder for any random value logic + randomValue := int64(randomValueParam) // Placeholder for any random value logic + logrus.Debugf("Task %d: Age Cost: %d, Priority Cost: %d, Resource Cost: %d, Random Value: %d", task.ID, ageCost, priorityCost, resourceCost, randomValue) return ageCost + priorityCost + resourceCost + randomValue } @@ -202,6 +210,8 @@ func (p *PriorityElderSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structur continue // Only consider tasks that are in the waiting state } if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) + // logrus.Debugf("Task %d is scheduled with emergency value %d", task.ID, p.CalculateTaskEmergency(task, taskToWorkerGroupMap)) return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled } } From a61a886c071c65ba3f4ce46dba55965efd00d042 Mon Sep 17 00:00:00 2001 From: ethereal Date: Thu, 31 Jul 2025 14:23:58 +0800 Subject: [PATCH 12/27] chore: check PriorityElderSchedulerAlgorithm --- vermeer/apps/master/bl/scheduler_bl.go | 16 ++- vermeer/apps/master/bl/task_bl.go | 10 ++ vermeer/apps/master/bl/task_canceler.go | 3 - vermeer/apps/master/schedules/broker.go | 14 ++- .../schedules/scheduler_algorithm_manager.go | 62 ++++++---- .../schedules/scheduler_resource_manager.go | 110 +++++++++++------- vermeer/apps/structure/task.go | 1 + 7 files changed, 140 insertions(+), 76 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index b949b6ed4..dc66b7c0f 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -99,18 +99,19 @@ func (s *ScheduleBl) 
tryScheduleInner(softSchedule bool) error { // step 2: get available resources and tasks logrus.Debugf("scheduling next tasks, softSchedule: %v", softSchedule) - idleWorkers := s.resourceManager.GetIdleWorkers() + idleWorkerGroups := s.resourceManager.GetIdleWorkerGroups() + concurrentWorkerGroups := s.resourceManager.GetConcurrentWorkerGroups() allTasks := s.taskManager.GetAllTasks() - if len(allTasks) == 0 || len(idleWorkers) == 0 { - logrus.Debugf("no available tasks or workers, allTasks: %d, idleWorkers: %d", - len(allTasks), len(idleWorkers)) + if len(allTasks) == 0 || (len(idleWorkerGroups) == 0 && len(concurrentWorkerGroups) == 0) { + logrus.Debugf("no available tasks or workerGroups, allTasks: %d, workerGroups: %d/%d", + len(allTasks), len(idleWorkerGroups), len(concurrentWorkerGroups)) return nil } - logrus.Debugf("all tasks: %d, idle workers: %d", len(allTasks), len(idleWorkers)) + logrus.Debugf("all tasks: %d, workerGroups: %d/%d", len(allTasks), len(idleWorkerGroups), len(concurrentWorkerGroups)) // step 3: return the task with the highest priority or small tasks which can be executed immediately taskToWorkerGroupMap := s.taskManager.GetTaskToWorkerGroupMap() - nextTasks, err := s.algorithmManager.ScheduleNextTasks(allTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) + nextTasks, err := s.algorithmManager.ScheduleNextTasks(allTasks, taskToWorkerGroupMap, idleWorkerGroups, concurrentWorkerGroups, softSchedule) if err != nil { logrus.Errorf("failed to schedule next tasks: %v", err) return err @@ -322,6 +323,9 @@ func (s *ScheduleBl) handleCancelTask(taskInfo *structure.TaskInfo) error { return err } + // set worker state to idle or concurrent running + s.resourceManager.ReleaseByTaskID(taskInfo.ID) + return nil } diff --git a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go index 7f48e2cd1..a8b6e50ce 100644 --- a/vermeer/apps/master/bl/task_bl.go +++ b/vermeer/apps/master/bl/task_bl.go @@ -65,6 +65,9 @@ func (tb *TaskBl) CreateTaskInfo( } // for scheduler + taskInfo.Priority = 0 + taskInfo.Preorders = make([]int32, 0) + taskInfo.Exclusive = false // default to false, can be set to true if needed if params != nil { if priority, ok := params["priority"]; ok { if p, err := strconv.Atoi(priority); err == nil { @@ -83,6 +86,13 @@ func (tb *TaskBl) CreateTaskInfo( } } } + if exclusive, ok := params["exclusive"]; ok { + if ex, err := strconv.ParseBool(exclusive); err == nil { + taskInfo.Exclusive = ex + } else { + logrus.Warnf("exclusive convert to bool error:%v", err) + } + } } return taskInfo, nil diff --git a/vermeer/apps/master/bl/task_canceler.go b/vermeer/apps/master/bl/task_canceler.go index b86cea597..7932c7805 100644 --- a/vermeer/apps/master/bl/task_canceler.go +++ b/vermeer/apps/master/bl/task_canceler.go @@ -22,7 +22,6 @@ import ( "fmt" "sync" "time" - "vermeer/apps/master/schedules" pb "vermeer/apps/protos" "vermeer/apps/structure" @@ -159,8 +158,6 @@ func (bc *baseCanceler) doCancelTask() (isContinue bool, err error) { canceled = false return } - // set worker state to idle - Scheduler.ChangeWorkerStatus(workerName, schedules.WorkerOngoingStatusIdle) }(workerName) } wg.Wait() diff --git a/vermeer/apps/master/schedules/broker.go b/vermeer/apps/master/schedules/broker.go index b44ca18b5..7cecac75e 100644 --- a/vermeer/apps/master/schedules/broker.go +++ b/vermeer/apps/master/schedules/broker.go @@ -73,7 +73,7 @@ func (b *Broker) AllAgents() []*Agent { return res } -func (b *Broker) ApplyAgent(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, 
map[string]*workers.WorkerClient, error) { +func (b *Broker) ApplyAgent(taskInfo *structure.TaskInfo, forceApply ...bool) (*Agent, AgentStatus, map[string]*workers.WorkerClient, error) { if taskInfo == nil { return nil, AgentStatusError, nil, fmt.Errorf("taskInfo is nil") } @@ -98,12 +98,14 @@ func (b *Broker) ApplyAgent(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, return nil, AgentStatusWorkerNotReady, nil, nil } - if b.isAgentBusy(agent) { - return nil, AgentStatusAgentBusy, nil, nil - } + if !(forceApply != nil && len(forceApply) > 0 && forceApply[0]) { + if b.isAgentBusy(agent) { + return nil, AgentStatusAgentBusy, nil, nil + } - if b.isWorkerBusy(workers, agent) { - return nil, AgentStatusWorkerBusy, nil, nil + if b.isWorkerBusy(workers, agent) { + return nil, AgentStatusWorkerBusy, nil, nil + } } agent.AssignTask(taskInfo) diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index 717d9c831..5778d1d3e 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -12,9 +12,9 @@ type SchedulerAlgorithm interface { // Name returns the name of the SchedulerAlgorithm Name() string // FilterNextTasks filters the next tasks to be scheduled based on the provided parameters - FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) + FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) // ScheduleNextTasks schedules the next tasks based on the filtered tasks - ScheduleNextTasks(filteredTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) + ScheduleNextTasks(filteredTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) } type SchedulerAlgorithmManager struct { @@ -73,7 +73,7 @@ func (am *SchedulerAlgorithmManager) ResumeDispatch() { am.dispatchPaused = false } -func (am *SchedulerAlgorithmManager) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (am *SchedulerAlgorithmManager) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { if am.dispatchPaused { return nil, nil // No tasks to schedule if dispatch is paused } @@ -81,7 +81,7 @@ func (am *SchedulerAlgorithmManager) ScheduleNextTasks(allTasks []*structure.Tas filteredTasks := allTasks for _, algorithm := range am.filteredSchedulerAlgorithms { var err error - filteredTasks, err = algorithm.FilterNextTasks(filteredTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) + filteredTasks, err = algorithm.FilterNextTasks(filteredTasks, taskToWorkerGroupMap, idleWorkerGroups, concurrentWorkerGroups, softSchedule) if err != nil { return nil, err } @@ -93,7 +93,7 @@ func (am *SchedulerAlgorithmManager) ScheduleNextTasks(allTasks []*structure.Tas // only support one scheduling algorithm for now // get first algorithm for _, algorithm := range 
am.schuduledSchedulerAlgorithms { - tasks, err := algorithm.ScheduleNextTasks(filteredTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) + tasks, err := algorithm.ScheduleNextTasks(filteredTasks, taskToWorkerGroupMap, idleWorkerGroups, concurrentWorkerGroups, softSchedule) if err != nil { return nil, err } @@ -109,12 +109,12 @@ func (f *FIFOSchedulerAlgorithm) Name() string { return "FIFO" } -func (f *FIFOSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (f *FIFOSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { // just return the waiting tasks as is for FIFO return allTasks, nil } -func (f *FIFOSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (f *FIFOSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { if len(allTasks) == 0 { return nil, nil // No tasks to schedule } @@ -125,7 +125,12 @@ func (f *FIFOSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInf continue // Only consider tasks that are in the waiting state } if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { - return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + for _, idleGroup := range idleWorkerGroups { + if group == idleGroup { + logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } } } @@ -138,12 +143,12 @@ func (p *PrioritySchedulerAlgorithm) Name() string { return "Priority" } -func (p *PrioritySchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (p *PrioritySchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { // just return the waiting tasks as is for Priority return allTasks, nil } -func (p *PrioritySchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (p *PrioritySchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { if len(allTasks) == 0 { return nil, nil // No tasks to schedule } @@ -158,7 +163,12 @@ func (p *PrioritySchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.Tas continue // Only consider tasks that are in the waiting state } if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { - return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + for _, idleGroup := range idleWorkerGroups { + if group == idleGroup { + logrus.Debugf("Task %d is assigned to worker group %s", 
task.ID, group) + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } } } @@ -171,7 +181,7 @@ func (p *PriorityElderSchedulerAlgorithm) Name() string { return "PriorityElder" } -func (p *PriorityElderSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (p *PriorityElderSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { // just return the waiting tasks as is for PriorityElder return allTasks, nil } @@ -195,7 +205,7 @@ func (p *PriorityElderSchedulerAlgorithm) CalculateTaskEmergency(task *structure return ageCost + priorityCost + resourceCost + randomValue } -func (p *PriorityElderSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (p *PriorityElderSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { if len(allTasks) == 0 { return nil, nil // No tasks to schedule } @@ -210,9 +220,12 @@ func (p *PriorityElderSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structur continue // Only consider tasks that are in the waiting state } if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { - logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) - // logrus.Debugf("Task %d is scheduled with emergency value %d", task.ID, p.CalculateTaskEmergency(task, taskToWorkerGroupMap)) - return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + for _, idleGroup := range idleWorkerGroups { + if group == idleGroup { + logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } } } @@ -225,7 +238,7 @@ func (w *WaitingSchedulerAlgorithm) Name() string { return "Waiting" } -func (w *WaitingSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (w *WaitingSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { waitingTasks := make([]*structure.TaskInfo, 0) for _, task := range allTasks { if task.State == structure.TaskStateWaiting { @@ -235,8 +248,8 @@ func (w *WaitingSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskIn return waitingTasks, nil } -func (w *WaitingSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { - waitingTasks, err := w.FilterNextTasks(allTasks, taskToWorkerGroupMap, idleWorkers, softSchedule) +func (w *WaitingSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { + waitingTasks, err := 
w.FilterNextTasks(allTasks, taskToWorkerGroupMap, idleWorkerGroups, concurrentWorkerGroups, softSchedule) if err != nil { return nil, err } @@ -253,7 +266,7 @@ func (d *DependsSchedulerAlgorithm) Name() string { return "Depends" } -func (d *DependsSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (d *DependsSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { if len(allTasks) == 0 { return nil, nil // No tasks to schedule } @@ -287,7 +300,7 @@ func (d *DependsSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskIn return filteredTasks, nil } -func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkers []string, softSchedule bool) ([]*structure.TaskInfo, error) { +func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { if len(allTasks) == 0 { return nil, nil // No tasks to schedule } @@ -313,7 +326,12 @@ func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.Task } if allDepsSatisfied { if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { - return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + for _, idleGroup := range idleWorkerGroups { + if group == idleGroup { + logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } } } } diff --git a/vermeer/apps/master/schedules/scheduler_resource_manager.go b/vermeer/apps/master/schedules/scheduler_resource_manager.go index c8adc9412..37cc2e9a9 100644 --- a/vermeer/apps/master/schedules/scheduler_resource_manager.go +++ b/vermeer/apps/master/schedules/scheduler_resource_manager.go @@ -3,22 +3,25 @@ package schedules import ( "errors" "vermeer/apps/structure" + + "github.com/sirupsen/logrus" ) type WorkerOngoingStatus string const ( - WorkerOngoingStatusIdle WorkerOngoingStatus = "idle" - WorkerOngoingStatusRunning WorkerOngoingStatus = "running" - WorkerOngoingStatusPaused WorkerOngoingStatus = "paused" - WorkerOngoingStatusDeleted WorkerOngoingStatus = "deleted" + WorkerOngoingStatusIdle WorkerOngoingStatus = "idle" + WorkerOngoingStatusRunning WorkerOngoingStatus = "running" + WorkerOngoingStatusConcurrentRunning WorkerOngoingStatus = "concurrent_running" + WorkerOngoingStatusPaused WorkerOngoingStatus = "paused" + WorkerOngoingStatusDeleted WorkerOngoingStatus = "deleted" ) type SchedulerResourceManager struct { structure.MutexLocker - workerStatus map[string]WorkerOngoingStatus - runningWorkerTasks map[string][]int32 // worker ID to list of running task IDs - availableWorkerGroups map[string]bool // worker group name to availability status + workerStatus map[string]WorkerOngoingStatus + workerGroupStatus map[string]WorkerOngoingStatus + runningWorkerGroupTasks map[string][]int32 // worker group name to list of running task IDs // broker just responsible for communication with workers // it can not apply tasks to workers directly broker *Broker @@ -26,35 +29,43 @@ type SchedulerResourceManager struct 
{ func (rm *SchedulerResourceManager) Init() { rm.workerStatus = make(map[string]WorkerOngoingStatus) - rm.runningWorkerTasks = make(map[string][]int32) - rm.availableWorkerGroups = make(map[string]bool) + rm.workerGroupStatus = make(map[string]WorkerOngoingStatus) + rm.runningWorkerGroupTasks = make(map[string][]int32) rm.broker = new(Broker).Init() } func (rm *SchedulerResourceManager) ReleaseByTaskID(taskID int32) { defer rm.Unlock(rm.Lock()) - for worker, status := range rm.workerStatus { - if status == WorkerOngoingStatusRunning && rm.isTaskRunningOnWorker(worker, taskID) { - delete(rm.workerStatus, worker) - if tasks, exists := rm.runningWorkerTasks[worker]; exists { + for workerGroup, status := range rm.workerGroupStatus { + if (status == WorkerOngoingStatusRunning || status == WorkerOngoingStatusConcurrentRunning) && rm.isTaskRunningOnWorkerGroup(workerGroup, taskID) { + delete(rm.workerGroupStatus, workerGroup) + if tasks, exists := rm.runningWorkerGroupTasks[workerGroup]; exists { for i, id := range tasks { if id == taskID { - rm.runningWorkerTasks[worker] = append(tasks[:i], tasks[i+1:]...) - if len(rm.runningWorkerTasks[worker]) == 0 { - delete(rm.runningWorkerTasks, worker) + rm.runningWorkerGroupTasks[workerGroup] = append(tasks[:i], tasks[i+1:]...) + if len(rm.runningWorkerGroupTasks[workerGroup]) == 0 { + delete(rm.runningWorkerGroupTasks, workerGroup) } break } } } - rm.changeWorkerStatus(worker, WorkerOngoingStatusIdle) + if tasks, exists := rm.runningWorkerGroupTasks[workerGroup]; !exists || len(tasks) == 0 { + for _, worker := range workerMgr.GetGroupWorkers(workerGroup) { + rm.changeWorkerStatus(worker.Name, WorkerOngoingStatusIdle) + } + } else { + for _, worker := range workerMgr.GetGroupWorkers(workerGroup) { + rm.changeWorkerStatus(worker.Name, WorkerOngoingStatusConcurrentRunning) + } + } } } } -func (rm *SchedulerResourceManager) isTaskRunningOnWorker(worker string, taskID int32) bool { - if tasks, exists := rm.runningWorkerTasks[worker]; exists { +func (rm *SchedulerResourceManager) isTaskRunningOnWorkerGroup(workerGroup string, taskID int32) bool { + if tasks, exists := rm.runningWorkerGroupTasks[workerGroup]; exists { for _, id := range tasks { if id == taskID { return true @@ -82,64 +93,85 @@ func (rm *SchedulerResourceManager) GetAgentAndAssignTask(taskInfo *structure.Ta // Assign the task to the agent agent.AssignTask(taskInfo) + runningStatus := WorkerOngoingStatusRunning + if _, exists := rm.runningWorkerGroupTasks[agent.GroupName()]; !exists { + rm.runningWorkerGroupTasks[agent.GroupName()] = []int32{} + runningStatus = WorkerOngoingStatusRunning + rm.workerGroupStatus[agent.GroupName()] = runningStatus + } else { + runningStatus = WorkerOngoingStatusConcurrentRunning + rm.workerGroupStatus[agent.GroupName()] = runningStatus + } + rm.runningWorkerGroupTasks[agent.GroupName()] = append(rm.runningWorkerGroupTasks[agent.GroupName()], taskInfo.ID) + for _, worker := range workers { if worker == nil { continue } - rm.workerStatus[worker.Name] = WorkerOngoingStatusRunning - if _, exists := rm.runningWorkerTasks[worker.Name]; !exists { - rm.runningWorkerTasks[worker.Name] = []int32{} - } - rm.runningWorkerTasks[worker.Name] = append(rm.runningWorkerTasks[worker.Name], taskInfo.ID) + rm.workerStatus[worker.Name] = runningStatus } return agent, status, nil } -func (rm *SchedulerResourceManager) GetIdleWorkers() []string { +func (rm *SchedulerResourceManager) GetIdleWorkerGroups() []string { defer rm.Unlock(rm.Lock()) - idleWorkers := make([]string, 0) - for 
worker, status := range rm.workerStatus { + idleWorkerGroups := make([]string, 0) + for workerGroup, status := range rm.workerGroupStatus { if status == WorkerOngoingStatusIdle { - idleWorkers = append(idleWorkers, worker) + idleWorkerGroups = append(idleWorkerGroups, workerGroup) } } - return idleWorkers + return idleWorkerGroups +} + +func (rm *SchedulerResourceManager) GetConcurrentWorkerGroups() []string { + defer rm.Unlock(rm.Lock()) + + concurrentWorkerGroups := make([]string, 0) + for workerGroup, status := range rm.workerGroupStatus { + if status == WorkerOngoingStatusConcurrentRunning { + concurrentWorkerGroups = append(concurrentWorkerGroups, workerGroup) + } + } + return concurrentWorkerGroups } func (rm *SchedulerResourceManager) changeWorkerStatus(workerName string, status WorkerOngoingStatus) { rm.workerStatus[workerName] = status - if status == WorkerOngoingStatusIdle { + if status == WorkerOngoingStatusIdle || status == WorkerOngoingStatusConcurrentRunning { workerInfo := workerMgr.GetWorkerInfo(workerName) // get worker group name groupName := workerInfo.Group if groupName != "" { // check all workers in this group are idle - allIdle := true + allIdleOrConcurrent := true for _, w := range workerMgr.GetGroupWorkers(groupName) { - if rm.workerStatus[w.Name] != WorkerOngoingStatusIdle { - allIdle = false + if rm.workerStatus[w.Name] != WorkerOngoingStatusIdle && rm.workerStatus[w.Name] != WorkerOngoingStatusConcurrentRunning { + allIdleOrConcurrent = false break } } - if allIdle { - rm.availableWorkerGroups[groupName] = true - } else { - rm.availableWorkerGroups[groupName] = false + if allIdleOrConcurrent { + logrus.Debugf("Change worker group '%s' status to '%s' because all %d workers are idle or concurrent running", groupName, status, len(workerMgr.GetGroupWorkers(groupName))) + rm.changeWorkerGroupStatus(groupName, status) } } } else if status == WorkerOngoingStatusDeleted { delete(rm.workerStatus, workerName) - delete(rm.runningWorkerTasks, workerName) - delete(rm.availableWorkerGroups, workerName) } // TODO: Other status changes can be handled here if needed } +func (rm *SchedulerResourceManager) changeWorkerGroupStatus(workerGroup string, status WorkerOngoingStatus) { + logrus.Infof("Change worker group '%s' status to '%s'", workerGroup, status) + rm.workerGroupStatus[workerGroup] = status +} + // TODO: when sync task created, need to alloc worker? 
func (rm *SchedulerResourceManager) ChangeWorkerStatus(workerName string, status WorkerOngoingStatus) { defer rm.Unlock(rm.Lock()) diff --git a/vermeer/apps/structure/task.go b/vermeer/apps/structure/task.go index 2c20cdb29..cd75409ec 100644 --- a/vermeer/apps/structure/task.go +++ b/vermeer/apps/structure/task.go @@ -59,6 +59,7 @@ type TaskInfo struct { // for scheduler Priority int32 Preorders []int32 + Exclusive bool // whether the task can be executed concurrently with other tasks } func (ti *TaskInfo) SetState(state TaskState) { From 21ce37d0ebfb83d705ab3501e3b294575bb1ded8 Mon Sep 17 00:00:00 2001 From: ethereal Date: Thu, 31 Jul 2025 14:33:13 +0800 Subject: [PATCH 13/27] chore: add some note for idle workers --- .../schedules/scheduler_algorithm_manager.go | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index 5778d1d3e..7b0451521 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -125,6 +125,7 @@ func (f *FIFOSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.TaskInf continue // Only consider tasks that are in the waiting state } if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + // only support idle worker groups for now for _, idleGroup := range idleWorkerGroups { if group == idleGroup { logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) @@ -163,6 +164,7 @@ func (p *PrioritySchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.Tas continue // Only consider tasks that are in the waiting state } if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + // only support idle worker groups for now for _, idleGroup := range idleWorkerGroups { if group == idleGroup { logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) @@ -226,6 +228,15 @@ func (p *PriorityElderSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structur return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled } } + // if allow concurrent running, check if the group is in concurrent worker groups + if !task.Exclusive { + for _, concurrentGroup := range concurrentWorkerGroups { + if group == concurrentGroup { + logrus.Debugf("Task %d is assigned to concurrent worker group %s", task.ID, group) + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } + } } } @@ -256,8 +267,21 @@ func (w *WaitingSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.Task if len(waitingTasks) == 0 { return nil, nil } - // For waiting tasks, we simply return them as is - return waitingTasks, nil + for _, task := range waitingTasks { + if task.State != structure.TaskStateWaiting { + continue // Only consider tasks that are in the waiting state + } + if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + // only support idle worker groups for now + for _, idleGroup := range idleWorkerGroups { + if group == idleGroup { + logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled + } + } + } + } + return nil, nil // No tasks scheduled } type DependsSchedulerAlgorithm struct{} @@ -326,6 +350,7 @@ func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.Task } if allDepsSatisfied { if 
group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { + // only support idle worker groups for now for _, idleGroup := range idleWorkerGroups { if group == idleGroup { logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) From 9f1859947724b77edf050127138df1b7b6be10fc Mon Sep 17 00:00:00 2001 From: ethereal Date: Thu, 31 Jul 2025 14:58:35 +0800 Subject: [PATCH 14/27] chore: add cron manager --- vermeer/apps/master/bl/scheduler_bl.go | 15 ++++ .../schedules/scheduler_cron_manager.go | 75 +++++++++++++++++++ vermeer/apps/structure/task.go | 3 +- vermeer/go.mod | 1 + vermeer/go.sum | 2 + 5 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 vermeer/apps/master/schedules/scheduler_cron_manager.go diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index dc66b7c0f..96ecf8461 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -36,6 +36,8 @@ type ScheduleBl struct { algorithmManager *schedules.SchedulerAlgorithmManager // task management taskManager *schedules.SchedulerTaskManager + // cron management + cronManager *schedules.SchedulerCronManager // start channel for tasks to be started startChan chan *structure.TaskInfo } @@ -60,6 +62,8 @@ func (s *ScheduleBl) Init() { s.taskManager.Init() s.algorithmManager = &schedules.SchedulerAlgorithmManager{} s.algorithmManager.Init() + s.cronManager = &schedules.SchedulerCronManager{} + s.cronManager.Init(s.QueueTask) go s.startTicker() go s.waitingStartedTask() } @@ -164,6 +168,14 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { return ok, err } + if s.cronManager.CheckCronExpression(taskInfo.CronExpr) == nil { + if err := s.cronManager.AddCronTask(taskInfo); err != nil { + logrus.Errorf("failed to add cron task: %v", err) + return false, err + } + logrus.Infof("added cron task for task '%d' with expression '%s'", taskInfo.ID, taskInfo.CronExpr) + } + return ok, nil } @@ -174,6 +186,8 @@ func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) erro s.taskManager.RemoveTask(taskId) // release the worker group s.resourceManager.ReleaseByTaskID(taskId) + // stop the cron job if exists + s.cronManager.DeleteTask(taskId) if len(removeWorkerName) > 0 { workerName := removeWorkerName[0] @@ -287,6 +301,7 @@ func (s *ScheduleBl) CancelTask(taskInfo *structure.TaskInfo) error { isHeadTask := s.taskManager.IsTaskOngoing(taskInfo.ID) task := s.taskManager.RemoveTask(taskInfo.ID) + s.cronManager.DeleteTask(taskInfo.ID) // err := s.taskManager.CancelTask(taskInfo) isInQueue := false if task != nil { diff --git a/vermeer/apps/master/schedules/scheduler_cron_manager.go b/vermeer/apps/master/schedules/scheduler_cron_manager.go new file mode 100644 index 000000000..6e3b633fd --- /dev/null +++ b/vermeer/apps/master/schedules/scheduler_cron_manager.go @@ -0,0 +1,75 @@ +package schedules + +import ( + "errors" + "vermeer/apps/structure" + + "github.com/robfig/cron/v3" + "github.com/sirupsen/logrus" +) + +type SchedulerCronManager struct { + cronTasks map[int32][]*structure.TaskInfo // cron expression to TaskInfo. 
Origin task ID to copied tasks + crons map[int32][]*cron.Cron // cron expression to cron jobs + // queueHandler is a function that handles the task queue + queueHandler func(*structure.TaskInfo) (bool, error) +} + +func (t *SchedulerCronManager) Init(queueHandler func(*structure.TaskInfo) (bool, error)) *SchedulerCronManager { + t.cronTasks = make(map[int32][]*structure.TaskInfo) + t.crons = make(map[int32][]*cron.Cron) + t.queueHandler = queueHandler + return t +} + +func (t *SchedulerCronManager) CheckCronExpression(cronExpr string) error { + if cronExpr == "" { + return errors.New("cron expression is empty") + } + if _, err := cron.ParseStandard(cronExpr); err != nil { + return errors.New("invalid cron expression: " + err.Error()) + } + return nil +} + +func (t *SchedulerCronManager) AddCronTask(taskInfo *structure.TaskInfo) error { + if taskInfo == nil { + return errors.New("the argument `taskInfo` is nil") + } + + if taskInfo.CronExpr == "" { + return errors.New("the property `CronExpr` of taskInfo is empty") + } + + t.cronTasks[taskInfo.ID] = append(t.cronTasks[taskInfo.ID], taskInfo) + cronJob := cron.New() + _, err := cronJob.AddFunc(taskInfo.CronExpr, func() { + if taskInfo == nil { + return + } + if _, err := t.queueHandler(taskInfo); err != nil { + logrus.Errorf("Failed to queue task %d in cron job: %v", taskInfo.ID, err) + return + } + }) + if err != nil { + logrus.Errorf("Failed to add cron job for task %d: %v", taskInfo.ID, err) + return err + } + t.crons[taskInfo.ID] = append(t.crons[taskInfo.ID], cronJob) + return nil +} + +func (t *SchedulerCronManager) DeleteTask(taskID int32) error { + if _, exists := t.cronTasks[taskID]; !exists { + return errors.New("task not found in cron tasks") + } + + for _, cronJob := range t.crons[taskID] { + cronJob.Stop() + } + delete(t.cronTasks, taskID) + delete(t.crons, taskID) + logrus.Infof("Deleted cron task for task ID %d", taskID) + return nil +} diff --git a/vermeer/apps/structure/task.go b/vermeer/apps/structure/task.go index cd75409ec..eb0000699 100644 --- a/vermeer/apps/structure/task.go +++ b/vermeer/apps/structure/task.go @@ -59,7 +59,8 @@ type TaskInfo struct { // for scheduler Priority int32 Preorders []int32 - Exclusive bool // whether the task can be executed concurrently with other tasks + Exclusive bool // whether the task can be executed concurrently with other tasks + CronExpr string // cron expression for scheduling } func (ti *TaskInfo) SetState(state TaskState) { diff --git a/vermeer/go.mod b/vermeer/go.mod index 0ba852b4b..7d8542843 100644 --- a/vermeer/go.mod +++ b/vermeer/go.mod @@ -72,6 +72,7 @@ require ( github.com/prometheus/client_model v0.2.1-0.20210607210712-147c58e9608a // indirect github.com/prometheus/common v0.32.1 // indirect github.com/prometheus/procfs v0.7.3 // indirect + github.com/robfig/cron/v3 v3.0.0 // indirect github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.2.12 // indirect diff --git a/vermeer/go.sum b/vermeer/go.sum index c30b27ba5..743516024 100644 --- a/vermeer/go.sum +++ b/vermeer/go.sum @@ -395,6 +395,8 @@ github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4O github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.7.3 h1:4jVXhlkAyzOScmCkXBTOLRLTz8EeU+eyjrwB/EPq0VU= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= +github.com/robfig/cron/v3 v3.0.0 
h1:kQ6Cb7aHOHTSzNVNEhmp8EcWKLb4CbiMW9h9VyIhO4E= +github.com/robfig/cron/v3 v3.0.0/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g= github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= From e5790ac5ad13d2b26e3827e49f37788d007daec5 Mon Sep 17 00:00:00 2001 From: ethereal Date: Fri, 1 Aug 2025 01:20:01 +0800 Subject: [PATCH 15/27] chore: add configurations --- vermeer/apps/master/bl/scheduler_bl.go | 62 +++++++++--- .../schedules/scheduler_algorithm_manager.go | 96 +++++++++++++++++-- 2 files changed, 134 insertions(+), 24 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index 96ecf8461..103658b1f 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -40,20 +40,16 @@ type ScheduleBl struct { cronManager *schedules.SchedulerCronManager // start channel for tasks to be started startChan chan *structure.TaskInfo + // configurations + startChanSize int + tickerInterval int + softSchedule bool } func (s *ScheduleBl) Init() { logrus.Info("Initializing ScheduleBl...") - const defaultChanSizeConfig = "10" - chanSize := common.GetConfigDefault("start_chan_size", defaultChanSizeConfig).(string) - // Convert string to int - chanSizeInt, err := strconv.Atoi(chanSize) - if err != nil { - logrus.Errorf("failed to convert start_chan_size to int: %v", err) - logrus.Infof("using default start_chan_size: %s", defaultChanSizeConfig) - chanSizeInt, _ = strconv.Atoi(defaultChanSizeConfig) - } - startChan := make(chan *structure.TaskInfo, chanSizeInt) + s.LoadConfig() + startChan := make(chan *structure.TaskInfo, s.startChanSize) s.startChan = startChan s.resourceManager = &schedules.SchedulerResourceManager{} @@ -68,10 +64,47 @@ func (s *ScheduleBl) Init() { go s.waitingStartedTask() } +func (s *ScheduleBl) LoadConfig() { + // Load configuration from common package + + // startChanSize + const defaultChanSizeConfig = "10" + chanSize := common.GetConfigDefault("start_chan_size", defaultChanSizeConfig).(string) + // Convert string to int + chanSizeInt, err := strconv.Atoi(chanSize) + if err != nil { + logrus.Errorf("failed to convert start_chan_size to int: %v", err) + logrus.Infof("using default start_chan_size: %s", defaultChanSizeConfig) + chanSizeInt, _ = strconv.Atoi(defaultChanSizeConfig) + } + s.startChanSize = chanSizeInt + + // tickerInterval + const defaultTickerInterval = "3" + tickerInterval := common.GetConfigDefault("ticker_interval", defaultTickerInterval).(string) + tickerIntervalInt, err := strconv.Atoi(tickerInterval) + if err != nil { + logrus.Errorf("failed to convert ticker_interval to int: %v", err) + logrus.Infof("using default ticker_interval: %s", defaultTickerInterval) + tickerIntervalInt, _ = strconv.Atoi(defaultTickerInterval) + } + s.tickerInterval = tickerIntervalInt + + // softSchedule + softSchedule := common.GetConfigDefault("soft_schedule", "true").(string) + if softSchedule == "true" { + s.softSchedule = true + } else { + s.softSchedule = false + } + + logrus.Infof("ScheduleBl configuration: startChanSize=%d, tickerInterval=%d, softSchedule=%v", + s.startChanSize, s.tickerInterval, s.softSchedule) +} + func (s *ScheduleBl) startTicker() { - // Create a ticker that triggers every 3 seconds - // TODO: make it configurable - ticker := 
time.Tick(3 * time.Second) + // Create a ticker with the specified interval + ticker := time.Tick(time.Duration(s.tickerInterval) * time.Second) for range ticker { logrus.Debug("Ticker ticked") @@ -87,8 +120,7 @@ func (s *ScheduleBl) TryScheduleNextTasks() { } }() - // TODO: make it configurable - if err := s.tryScheduleInner(true); err != nil { + if err := s.tryScheduleInner(s.softSchedule); err != nil { logrus.Errorf("do scheduling error:%v", err) } } diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index 7b0451521..65de51d2d 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -2,7 +2,9 @@ package schedules import ( "sort" + "strconv" "time" + "vermeer/apps/common" "vermeer/apps/structure" "github.com/sirupsen/logrus" @@ -11,6 +13,8 @@ import ( type SchedulerAlgorithm interface { // Name returns the name of the SchedulerAlgorithm Name() string + // Init initializes the SchedulerAlgorithm + Init() // FilterNextTasks filters the next tasks to be scheduled based on the provided parameters FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) // ScheduleNextTasks schedules the next tasks based on the filtered tasks @@ -47,18 +51,20 @@ func (am *SchedulerAlgorithmManager) RegisterSchedulerAlgorithm(SchedulerAlgorit if len(am.schuduledSchedulerAlgorithms) > 0 { return // Only one scheduling algorithm can be registered } + SchedulerAlgorithm.Init() am.schuduledSchedulerAlgorithms[name] = SchedulerAlgorithm } -func (am *SchedulerAlgorithmManager) RegisterFilterAlgorithm(SchedulerAlgorithm SchedulerAlgorithm) { - if SchedulerAlgorithm == nil { +func (am *SchedulerAlgorithmManager) RegisterFilterAlgorithm(FilterAlgorithm SchedulerAlgorithm) { + if FilterAlgorithm == nil { return } - name := SchedulerAlgorithm.Name() + name := FilterAlgorithm.Name() if _, exists := am.filteredSchedulerAlgorithms[name]; exists { return // SchedulerAlgorithm already registered } - am.filteredSchedulerAlgorithms[name] = SchedulerAlgorithm + FilterAlgorithm.Init() + am.filteredSchedulerAlgorithms[name] = FilterAlgorithm } func (am *SchedulerAlgorithmManager) IsDispatchPaused() bool { @@ -109,6 +115,11 @@ func (f *FIFOSchedulerAlgorithm) Name() string { return "FIFO" } +func (f *FIFOSchedulerAlgorithm) Init() { + // No specific initialization needed for FIFO + logrus.Info("Initializing FIFOSchedulerAlgorithm") +} + func (f *FIFOSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { // just return the waiting tasks as is for FIFO return allTasks, nil @@ -144,6 +155,11 @@ func (p *PrioritySchedulerAlgorithm) Name() string { return "Priority" } +func (p *PrioritySchedulerAlgorithm) Init() { + // No specific initialization needed for Priority + logrus.Info("Initializing PrioritySchedulerAlgorithm") +} + func (p *PrioritySchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { // just return the waiting tasks as is for Priority return allTasks, nil @@ -177,12 +193,64 @@ func (p 
*PrioritySchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.Tas return nil, nil } -type PriorityElderSchedulerAlgorithm struct{} +type PriorityElderSchedulerAlgorithm struct { + ageParam int64 + priorityParam int64 + resourceParam int64 + randomValueParam int64 +} func (p *PriorityElderSchedulerAlgorithm) Name() string { return "PriorityElder" } +func (p *PriorityElderSchedulerAlgorithm) Init() { + logrus.Info("Initializing PriorityElderSchedulerAlgorithm") + + // Initialize parameters with default values + defaultAgeParam := "1" + defaultPriorityParam := "1" + defaultResourceParam := "10000000000" + defaultRandomValueParam := "1" // Placeholder for any random value logic + + // Load parameters from configuration + ageParam := common.GetConfigDefault("priority_elder_age_param", defaultAgeParam).(string) + priorityParam := common.GetConfigDefault("priority_elder_priority_param", defaultPriorityParam).(string) + resourceParam := common.GetConfigDefault("priority_elder_resource_param", defaultResourceParam).(string) + randomValueParam := common.GetConfigDefault("priority_elder_random_value_param", defaultRandomValueParam).(string) + + ageParamInt, err := strconv.Atoi(ageParam) + if err != nil { + logrus.Errorf("failed to convert priority_elder_age_param to int: %v", err) + logrus.Infof("using default priority_elder_age_param: %s", defaultAgeParam) + ageParamInt, _ = strconv.Atoi(defaultAgeParam) + } + p.ageParam = int64(ageParamInt) + priorityParamInt, err := strconv.Atoi(priorityParam) + if err != nil { + logrus.Errorf("failed to convert priority_elder_priority_param to int: %v", err) + logrus.Infof("using default priority_elder_priority_param: %s", defaultPriorityParam) + priorityParamInt, _ = strconv.Atoi(defaultPriorityParam) + } + p.priorityParam = int64(priorityParamInt) + resourceParamInt, err := strconv.Atoi(resourceParam) + if err != nil { + logrus.Errorf("failed to convert priority_elder_resource_param to int: %v", err) + logrus.Infof("using default priority_elder_resource_param: %s", defaultResourceParam) + resourceParamInt, _ = strconv.Atoi(defaultResourceParam) + } + p.resourceParam = int64(resourceParamInt) + randomValueParamInt, err := strconv.Atoi(randomValueParam) + if err != nil { + logrus.Errorf("failed to convert priority_elder_random_value_param to int: %v", err) + logrus.Infof("using default priority_elder_random_value_param: %s", defaultRandomValueParam) + randomValueParamInt, _ = strconv.Atoi(defaultRandomValueParam) + } + p.randomValueParam = int64(randomValueParamInt) + + logrus.Infof("PriorityElderSchedulerAlgorithm initialized with parameters: ageParam=%d, priorityParam=%d, resourceParam=%d, randomValueParam=%d", p.ageParam, p.priorityParam, p.resourceParam, p.randomValueParam) +} + func (p *PriorityElderSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { // just return the waiting tasks as is for PriorityElder return allTasks, nil @@ -190,10 +258,10 @@ func (p *PriorityElderSchedulerAlgorithm) FilterNextTasks(allTasks []*structure. 
func (p *PriorityElderSchedulerAlgorithm) CalculateTaskEmergency(task *structure.TaskInfo, taskToWorkerGroupMap map[int32]string) int64 { // step 0: get params - ageParam := int64(1) - priorityParam := int64(1) - resourceParam := int64(1e10) - randomValueParam := int64(1) + ageParam := p.ageParam + priorityParam := p.priorityParam + resourceParam := p.resourceParam + randomValueParam := p.randomValueParam // step 1: age ageCost := ageParam * time.Since(task.CreateTime).Milliseconds() / 1000 // in seconds // step 2: priority @@ -249,6 +317,11 @@ func (w *WaitingSchedulerAlgorithm) Name() string { return "Waiting" } +func (w *WaitingSchedulerAlgorithm) Init() { + // No specific initialization needed for Waiting + logrus.Info("Initializing WaitingSchedulerAlgorithm") +} + func (w *WaitingSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { waitingTasks := make([]*structure.TaskInfo, 0) for _, task := range allTasks { @@ -290,6 +363,11 @@ func (d *DependsSchedulerAlgorithm) Name() string { return "Depends" } +func (d *DependsSchedulerAlgorithm) Init() { + // No specific initialization needed for Depends + logrus.Info("Initializing DependsSchedulerAlgorithm") +} + func (d *DependsSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { if len(allTasks) == 0 { return nil, nil // No tasks to schedule From d503951c53610e59f581198efa3f5e2ad8c90079 Mon Sep 17 00:00:00 2001 From: ethereal Date: Tue, 5 Aug 2025 22:23:08 +0800 Subject: [PATCH 16/27] test: add some test for priority --- .../schedules/scheduler_algorithm_manager.go | 12 +++++-- vermeer/test/functional/compute_base.go | 34 +++++++++++++++++++ vermeer/test/functional/compute_task.go | 1 + vermeer/vermeer_test.go | 3 +- 4 files changed, 46 insertions(+), 4 deletions(-) diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index 65de51d2d..8a4333140 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -256,7 +256,7 @@ func (p *PriorityElderSchedulerAlgorithm) FilterNextTasks(allTasks []*structure. 
 	return allTasks, nil
 }
 
-func (p *PriorityElderSchedulerAlgorithm) CalculateTaskEmergency(task *structure.TaskInfo, taskToWorkerGroupMap map[int32]string) int64 {
+func (p *PriorityElderSchedulerAlgorithm) CalculateTaskEmergency(task *structure.TaskInfo, taskToWorkerGroupMap map[int32]string, printValue bool) int64 {
 	// step 0: get params
 	ageParam := p.ageParam
 	priorityParam := p.priorityParam
@@ -271,7 +271,9 @@ func (p *PriorityElderSchedulerAlgorithm) CalculateTaskEmergency(task *structure
 	resourceCost := resourceParam / max(1, graph.VertexCount+graph.EdgeCount) // Avoid division by zero, ensure at least 1
 	// step 4: some random value
 	randomValue := int64(randomValueParam) // Placeholder for any random value logic
-	logrus.Debugf("Task %d: Age Cost: %d, Priority Cost: %d, Resource Cost: %d, Random Value: %d", task.ID, ageCost, priorityCost, resourceCost, randomValue)
+	if printValue {
+		logrus.Debugf("Task %d: Age Cost: %d, Priority Cost: %d, Resource Cost: %d, Random Value: %d", task.ID, ageCost, priorityCost, resourceCost, randomValue)
+	}
 	return ageCost + priorityCost + resourceCost + randomValue
 }
 
@@ -282,9 +284,13 @@ func (p *PriorityElderSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structur
 
 	// Sort tasks by priority (higher priority first)
 	sort.Slice(allTasks, func(i, j int) bool {
-		return p.CalculateTaskEmergency(allTasks[i], taskToWorkerGroupMap) > p.CalculateTaskEmergency(allTasks[j], taskToWorkerGroupMap)
+		return p.CalculateTaskEmergency(allTasks[i], taskToWorkerGroupMap, false) > p.CalculateTaskEmergency(allTasks[j], taskToWorkerGroupMap, false)
 	})
 
+	for _, task := range allTasks {
+		logrus.Debugf("Task %d: Emergency Value: %d", task.ID, p.CalculateTaskEmergency(task, taskToWorkerGroupMap, true))
+	}
+
 	for _, task := range allTasks {
 		if task.State != structure.TaskStateWaiting {
 			continue // Only consider tasks that are in the waiting state
diff --git a/vermeer/test/functional/compute_base.go b/vermeer/test/functional/compute_base.go
index 256c72941..11e67d948 100644
--- a/vermeer/test/functional/compute_base.go
+++ b/vermeer/test/functional/compute_base.go
@@ -97,6 +97,40 @@ func (ctb *ComputeTaskBase) SendComputeReqAsync(params map[string]string) {
 	require.Equal(ctb.t, "complete", taskResp.Task.Status)
 }
 
+func (ctb *ComputeTaskBase) SendComputeReqAsyncBatchPriority(num int, params map[string]string) {
+	// create compute tasks
+	tasks := make([]client.TaskInfo, 0, num)
+	for i := 0; i < num; i++ {
+		params["priority"] = strconv.Itoa(i % 10) // set a different priority for each task
+		resp, err := ctb.masterHttp.CreateTaskAsync(client.TaskCreateRequest{
+			TaskType:  "compute",
+			GraphName: ctb.graphName,
+			Params:    params,
+		})
+		require.NoError(ctb.t, err)
+		tasks = append(tasks, resp.Task)
+	}
+
+	for i := 0; i < num; i++ {
+		ctb.taskID = int(tasks[i].ID)
+		// once the compute task is created, poll tasksGet and parse the response, breaking when the status is complete
+		var taskResp *client.TaskResponse
+		var err error
+		for i := 0; i < ctb.waitSecond; i++ {
+			ctb.healthCheck.DoHealthCheck()
+			taskResp, err = ctb.masterHttp.GetTask(ctb.taskID)
+			require.NoError(ctb.t, err)
+			if taskResp.Task.Status == "complete" {
+				break
+			}
+			require.NotEqual(ctb.t, "error", taskResp.Task.Status)
+			time.Sleep(1 * time.Second)
+		}
+		require.Equal(ctb.t, "complete", taskResp.Task.Status)
+		fmt.Printf("Compute Task %d completed successfully\n", ctb.taskID)
+	}
+}
+
 // SendComputeReqSync
 //
 // @Description: 发送Http请求,无需重写,同步请求
diff --git a/vermeer/test/functional/compute_task.go b/vermeer/test/functional/compute_task.go
index 14ee5a52b..96939e134 100644
--- a/vermeer/test/functional/compute_task.go
+++ b/vermeer/test/functional/compute_task.go
@@ -36,6 +36,7 @@ type ComputeTask interface {
 		masterHttp *client.VermeerClient, t *testing.T, healthCheck *HealthCheck)
 	TaskComputeBody() map[string]string
 	SendComputeReqAsync(params map[string]string)
+	SendComputeReqAsyncBatchPriority(num int, params map[string]string)
 	SendComputeReqSync(params map[string]string)
 	LoadComputeRes() ([]interface{}, error)
 	CheckRes()
diff --git a/vermeer/vermeer_test.go b/vermeer/vermeer_test.go
index e3e4158a7..86c2ecfde 100644
--- a/vermeer/vermeer_test.go
+++ b/vermeer/vermeer_test.go
@@ -158,7 +158,8 @@ func testAlgorithms(t *testing.T) {
 			taskComputeBody := computeTest.TaskComputeBody()
 			taskComputeBody["output.need_query"] = needQuery
 			if sendType == "async" {
-				computeTest.SendComputeReqAsync(taskComputeBody)
+				// computeTest.SendComputeReqAsync(taskComputeBody)
+				computeTest.SendComputeReqAsyncBatchPriority(10, taskComputeBody) // send multiple requests asynchronously
 			} else {
 				computeTest.SendComputeReqSync(taskComputeBody)
 			}

From dc9808b9286fc49592df7a8ec7fb9a460c7f020a Mon Sep 17 00:00:00 2001
From: ethereal
Date: Thu, 11 Sep 2025 01:31:16 +0800
Subject: [PATCH 17/27] chore: add some test for priority

---
 vermeer/apps/graphio/local_file.go         |   1 +
 vermeer/apps/master/bl/scheduler_bl.go     |  39 ++++++-
 vermeer/apps/master/bl/task_bl.go          |   5 +-
 .../schedules/scheduler_task_manager.go    |  42 ++++++-
 vermeer/apps/master/services/http_tasks.go |  32 +++++-
 vermeer/apps/master/services/router.go     |   7 +-
 vermeer/client/client.go                   |  50 ++++++++
 vermeer/client/structure.go                |  13 +++
 vermeer/config/master.ini                  |   2 +-
 vermeer/test/functional/compute_base.go    |  43 +++++--
 vermeer/test/functional/compute_task.go    |   2 +-
 vermeer/test/functional/load_local.go      |  26 +++++
 vermeer/test/scheduler/batch.go            |  14 +++
 vermeer/test/scheduler/priority.go         | 108 ++++++++++++++++++
 vermeer/test/scheduler/routine.go          |  13 +++
 vermeer/test/scheduler/test_scheduler.go   |  60 ++++++++++
 vermeer/vermeer_test.go                    |  11 +-
 17 files changed, 443 insertions(+), 25 deletions(-)
 create mode 100644 vermeer/test/scheduler/batch.go
 create mode 100644 vermeer/test/scheduler/priority.go
 create mode 100644 vermeer/test/scheduler/routine.go
 create mode 100644 vermeer/test/scheduler/test_scheduler.go

diff --git a/vermeer/apps/graphio/local_file.go b/vermeer/apps/graphio/local_file.go
index 8cdb72623..7d6119304 100644
--- a/vermeer/apps/graphio/local_file.go
+++ b/vermeer/apps/graphio/local_file.go
@@ -82,6 +82,7 @@ func (a *LocalMaker) MakeTasks(params map[string]string, taskID int32) ([]LoadPa
 		logrus.Errorf(s)
 		return nil, errors.New(s)
 	}
+	logrus.Debugf("MakeTask LoadTypeLocal parse file: %s, s:%d, e:%d", files, s, e)
 	for i := s; i <= e; i++ {
 		part := LoadPartition{}
 		part.Init(partID, taskID, LoadPartTypeVertex)
diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go
index 103658b1f..146f24364 100644
--- a/vermeer/apps/master/bl/scheduler_bl.go
+++ b/vermeer/apps/master/bl/scheduler_bl.go
@@ -176,7 +176,6 @@ func (s *ScheduleBl) tryScheduleInner(softSchedule bool) error {
 }
 
 // QueueTask Add the task to the inner queue.
-// The tasks will be executed in order from the queue.
 // If the task exists, return false.
 func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) {
 	if taskInfo == nil {
@@ -192,6 +191,8 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) {
 		return false, err
 	}
 
+	logrus.Debugf("queuing task %d with parameters: %+v", taskInfo.ID, taskInfo)
+
 	// Notice: Ensure successful invocation.
 	// make sure all tasks have alloc to a worker group
 	ok, err := s.taskManager.QueueTask(taskInfo)
@@ -211,6 +212,31 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) {
 	return ok, nil
 }
 
+func (s *ScheduleBl) BatchQueueTask(taskInfos []*structure.TaskInfo) ([]bool, []error) {
+	if len(taskInfos) == 0 {
+		return []bool{}, []error{}
+	}
+
+	s.PauseDispatch()
+
+	defer s.ResumeDispatch()
+	defer s.Unlock(s.Lock())
+
+	errs := make([]error, 0, len(taskInfos))
+	oks := make([]bool, 0, len(taskInfos))
+
+	for _, taskInfo := range taskInfos {
+		ok, err := s.QueueTask(taskInfo)
+		if err != nil {
+			logrus.Errorf("failed to queue task '%d': %v", taskInfo.ID, err)
+		}
+		oks = append(oks, ok)
+		errs = append(errs, err)
+	}
+
+	return oks, errs
+}
+
 // ******** CloseCurrent ********
 
 func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) error {
@@ -317,6 +343,12 @@ func (s *ScheduleBl) startWaitingTask(agent *schedules.Agent, taskInfo *structur
 	taskInfo.StartTime = time.Now()
 
 	err = taskStarter.StartTask()
+
+	// only for test or debug, record the task start sequence
+	if err := s.taskManager.AddTaskStartSequence(taskInfo.ID); err != nil {
+		logrus.Errorf("failed to add task '%d' to start sequence: %v", taskInfo.ID, err)
+	}
+
 	if err != nil {
 		logrus.Errorf("failed to start a task, type: %s, taskID: %d, caused by: %v", taskInfo.Type, taskInfo.ID, err)
 		taskMgr.SetError(taskInfo, err.Error())
@@ -406,3 +438,8 @@ func (s *ScheduleBl) TasksInQueue(space string) []*structure.TaskInfo {
 	// Implement logic to get tasks in the queue for a specific space
 	return s.taskManager.GetTasksInQueue(space)
 }
+
+func (s *ScheduleBl) TaskStartSequence(queryTasks []int32) []*structure.TaskInfo {
+	// Only for debug or test, get task start sequence
+	return s.taskManager.GetTaskStartSequence(queryTasks)
+}
diff --git a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go
index a8b6e50ce..9070a6f29 100644
--- a/vermeer/apps/master/bl/task_bl.go
+++ b/vermeer/apps/master/bl/task_bl.go
@@ -239,13 +239,10 @@ func QueueExecuteTask(taskInfo *structure.TaskInfo) error {
 }
 
 func QueueExecuteTasks(tasksInfo []*structure.TaskInfo) []error {
-	defer Scheduler.Unlock(Scheduler.Lock())
 
-	errs := make([]error, 0, len(tasksInfo))
 	for _, task := range tasksInfo {
 		task.CreateType = structure.TaskCreateAsync
-		_, err := Scheduler.QueueTask(task)
-		errs = append(errs, err)
 	}
+	_, errs := Scheduler.BatchQueueTask(tasksInfo)
 	return errs
 }
diff --git a/vermeer/apps/master/schedules/scheduler_task_manager.go b/vermeer/apps/master/schedules/scheduler_task_manager.go
index 0683faaf8..2f850f23b 100644
--- a/vermeer/apps/master/schedules/scheduler_task_manager.go
+++ b/vermeer/apps/master/schedules/scheduler_task_manager.go
@@ -3,13 +3,16 @@ package schedules
 import (
 	"errors"
 	"vermeer/apps/structure"
+
+	"github.com/sirupsen/logrus"
 )
 
 type SchedulerTaskManager struct {
 	// This struct is responsible for managing tasks in the scheduling system.
 	// A map from task ID to TaskInfo can be used to track tasks.
- allTaskMap map[int32]*structure.TaskInfo - allTaskQueue []*structure.TaskInfo + allTaskMap map[int32]*structure.TaskInfo + allTaskQueue []*structure.TaskInfo + startTaskQueue []*structure.TaskInfo // A map from task ID to worker group can be used to track which worker group is handling which task. taskToworkerGroupMap map[int32]string } @@ -31,10 +34,20 @@ func (t *SchedulerTaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, er // Add the task to the task map t.allTaskMap[taskInfo.ID] = taskInfo + t.allTaskQueue = append(t.allTaskQueue, taskInfo) t.AssignGroup(taskInfo) return true, nil } +// Only for debug or test, get task start sequence +func (t *SchedulerTaskManager) AddTaskStartSequence(taskID int32) error { + if _, exists := t.allTaskMap[taskID]; !exists { + return errors.New("task not found") + } + t.startTaskQueue = append(t.startTaskQueue, t.allTaskMap[taskID]) + return nil +} + func (t *SchedulerTaskManager) RemoveTask(taskID int32) error { if _, exists := t.allTaskMap[taskID]; !exists { return errors.New("task not found") @@ -103,6 +116,31 @@ func (t *SchedulerTaskManager) GetTasksInQueue(space string) []*structure.TaskIn return tasks } +// Only for debug or test, get task start sequence +func (t *SchedulerTaskManager) GetTaskStartSequence(queryTasks []int32) []*structure.TaskInfo { + if len(t.startTaskQueue) == 0 { + return nil + } + if len(queryTasks) == 0 { + return t.startTaskQueue + } + tasks := make([]*structure.TaskInfo, 0, len(queryTasks)) + taskSet := make(map[int32]struct{}) + for _, id := range queryTasks { + taskSet[id] = struct{}{} + } + for _, task := range t.startTaskQueue { + if _, exists := taskSet[task.ID]; exists { + tasks = append(tasks, task) + } + } + logrus.Infof("GetTaskStartSequence: return %d tasks", len(tasks)) + for _, task := range tasks { + logrus.Debugf("TaskID: %d", task.ID) + } + return tasks +} + func (t *SchedulerTaskManager) GetTaskToWorkerGroupMap() map[int32]string { // Return a copy of the worker group map to avoid external modifications groupMap := make(map[int32]string, len(t.taskToworkerGroupMap)) diff --git a/vermeer/apps/master/services/http_tasks.go b/vermeer/apps/master/services/http_tasks.go index 03130ca80..de4c5eaed 100644 --- a/vermeer/apps/master/services/http_tasks.go +++ b/vermeer/apps/master/services/http_tasks.go @@ -112,7 +112,7 @@ func handleTaskCreation(ctx *gin.Context, exeFunc func(*structure.TaskInfo) erro } filteredTask := taskBiz(ctx).FilteringTask(task) - ctx.JSON(http.StatusOK, TaskResp{Task: taskInfo2TaskJson(filteredTask)}) + ctx.JSON(http.StatusOK, TaskCreateResponse{Task: taskInfo2TaskJson(filteredTask)}) } type TaskCreateBatchHandler struct { @@ -231,3 +231,33 @@ func (ch *ComputeValueHandler) GET(ctx *gin.Context) { ctx.JSON(http.StatusOK, resp) } + +type TaskStartSequenceHandler struct { + common.SenHandler +} + +type TaskStartSequenceRequest struct { + QueryTasks []int32 `json:"query_tasks,omitempty"` +} + +type TaskStartSequenceResp struct { + common.BaseResp + Sequence []int32 `json:"sequence,omitempty"` +} + +func (tsh *TaskStartSequenceHandler) POST(ctx *gin.Context) { + req := TaskStartSequenceRequest{} + err := ctx.BindJSON(&req) + if isBad(err != nil, ctx, func() string { return fmt.Sprintf("request body not correct: %s", err) }) { + return + } + + r := Scheduler.TaskStartSequence(req.QueryTasks) + + sequence := make([]int32, 0, 1) + for _, task := range r { + sequence = append(sequence, int32(task.ID)) + } + + ctx.JSON(http.StatusOK, TaskStartSequenceResp{Sequence: sequence, BaseResp: 
common.BaseResp{ErrCode: 0, Message: "ok"}}) +} diff --git a/vermeer/apps/master/services/router.go b/vermeer/apps/master/services/router.go index e1000d25c..ac01e8340 100644 --- a/vermeer/apps/master/services/router.go +++ b/vermeer/apps/master/services/router.go @@ -35,12 +35,13 @@ func SetRouters(sen *common.Sentinel, authFilters ...gin.HandlerFunc) { // /tasks regVerAPI(sen, 1, "/tasks", map[string]common.BaseHandler{ - "": &TasksHandler{}, - "/create": &TaskCreateHandler{}, - // "/create/batch": &TaskCreateBatchHandler{}, + "": &TasksHandler{}, + "/create": &TaskCreateHandler{}, + "/create/batch": &TaskCreateBatchHandler{}, "/create/sync": &TaskCreateSyncHandler{}, "/oltp": &OltpHandler{}, "/value/:task_id": &ComputeValueHandler{}, + "/start_sequence": &TaskStartSequenceHandler{}, }, authFilters...) // /task diff --git a/vermeer/client/client.go b/vermeer/client/client.go index d9d1a17f9..04c50a237 100644 --- a/vermeer/client/client.go +++ b/vermeer/client/client.go @@ -228,6 +228,31 @@ func (vc *VermeerClient) CreateTaskSync(request TaskCreateRequest) (*TaskRespons return taskResponse, err } +func (vc *VermeerClient) CreateTaskBatch(request TaskCreateBatchRequest) (*TaskBatchCreateResponse, error) { + reader, err := Request2Reader(request) + if err != nil { + return nil, err + } + resp, err := vc.post(vc.httpAddr+"/tasks/create/batch", reader) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + respByte, err := ParseResponse2Byte(resp) + if err != nil { + return nil, err + } + return nil, fmt.Errorf("response:%s", string(respByte)) + } + taskResp := &TaskBatchCreateResponse{} + err = ParseResponse2Any(resp, taskResp) + if err != nil { + return nil, err + } + return taskResp, err +} + func (vc *VermeerClient) GetTasks() (*TasksResponse, error) { resp, err := vc.get(vc.httpAddr + "/tasks") if err != nil { @@ -270,6 +295,31 @@ func (vc *VermeerClient) GetTask(taskID int) (*TaskResponse, error) { return taskResp, err } +func (vc *VermeerClient) GetTaskStartSequence(queryTasks []int32) (*TaskStartSequenceResp, error) { + reader, err := Request2Reader(TaskStartSequenceRequest{QueryTasks: queryTasks}) + if err != nil { + return nil, err + } + resp, err := vc.post(vc.httpAddr+"/tasks/start_sequence", reader) + if err != nil { + return nil, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + respByte, err := ParseResponse2Byte(resp) + if err != nil { + return nil, err + } + return nil, fmt.Errorf("response:%s", string(respByte)) + } + taskResp := &TaskStartSequenceResp{} + err = ParseResponse2Any(resp, taskResp) + if err != nil { + return nil, err + } + return taskResp, err +} + func (vc *VermeerClient) GetEdges(graphName string, vertexID string, direction string) (*EdgesResponse, error) { resp, err := vc.get(vc.httpAddr + "/graphs/" + graphName + "/edges?vertex_id=" + vertexID + "&direction=" + direction) if err != nil { diff --git a/vermeer/client/structure.go b/vermeer/client/structure.go index ceeead394..ee233b42f 100644 --- a/vermeer/client/structure.go +++ b/vermeer/client/structure.go @@ -38,6 +38,17 @@ type TaskResponse struct { Task TaskInfo `json:"task,omitempty"` } +type TaskStartSequenceRequest struct { + QueryTasks []int32 `json:"query_tasks,omitempty"` +} + +type TaskStartSequenceResp struct { + BaseResponse + Sequence []int32 `json:"sequence,omitempty"` +} + +type TaskBatchCreateResponse []TaskResponse + type TaskInfo struct { ID int32 `json:"id,omitempty"` Status string `json:"status,omitempty"` @@ 
-161,6 +172,8 @@ type TaskCreateRequest struct { Params map[string]string `json:"params"` } +type TaskCreateBatchRequest []TaskCreateRequest + type GraphCreateRequest struct { Name string `json:"name,omitempty"` } diff --git a/vermeer/config/master.ini b/vermeer/config/master.ini index 8a7adb133..11827608a 100644 --- a/vermeer/config/master.ini +++ b/vermeer/config/master.ini @@ -14,7 +14,7 @@ ; limitations under the License. [default] -log_level=info +log_level=debug debug_mode=release http_peer=0.0.0.0:6688 grpc_peer=0.0.0.0:6689 diff --git a/vermeer/test/functional/compute_base.go b/vermeer/test/functional/compute_base.go index 11e67d948..5bdb1d64c 100644 --- a/vermeer/test/functional/compute_base.go +++ b/vermeer/test/functional/compute_base.go @@ -97,21 +97,34 @@ func (ctb *ComputeTaskBase) SendComputeReqAsync(params map[string]string) { require.Equal(ctb.t, "complete", taskResp.Task.Status) } -func (ctb *ComputeTaskBase) SendComputeReqAsyncBatchPriority(num int, params map[string]string) { +func (ctb *ComputeTaskBase) SendComputeReqAsyncBatchPriority(params []map[string]string) ([]int32, []int32) { //create Compute Task - tasks := make([]client.TaskInfo, 0, num) - for i := 0; i < num; i++ { - params["priority"] = strconv.Itoa(i % 10) // 设置不同的优先级 - resp, err := ctb.masterHttp.CreateTaskAsync(client.TaskCreateRequest{ + tasks := make([]client.TaskInfo, 0, len(params)) + taskIds := make([]int32, 0, len(params)) + createTasksParams := client.TaskCreateBatchRequest{} + for i := 0; i < len(params); i++ { + graph := ctb.graphName + if params[i]["graph_name"] != "" { + graph = params[i]["graph_name"] + } + createTasksParams = append(createTasksParams, client.TaskCreateRequest{ TaskType: "compute", - GraphName: ctb.graphName, - Params: params, + GraphName: graph, + Params: params[i], }) - require.NoError(ctb.t, err) - tasks = append(tasks, resp.Task) + } + resp, err := ctb.masterHttp.CreateTaskBatch(createTasksParams) + require.NoError(ctb.t, err) + + for i, r := range *resp { + if r.BaseResponse.ErrCode != 0 { + ctb.t.Fatalf("create compute task %d failed: %s", i, r.BaseResponse.Message) + } + tasks = append(tasks, r.Task) + taskIds = append(taskIds, r.Task.ID) } - for i := 0; i < num; i++ { + for i := 0; i < len(params); i++ { ctb.taskID = int(tasks[i].ID) //若成功启动Compute Task,开始轮询tasksGet,解析response,得到状态为完成时break。 var taskResp *client.TaskResponse @@ -129,6 +142,16 @@ func (ctb *ComputeTaskBase) SendComputeReqAsyncBatchPriority(num int, params map require.Equal(ctb.t, "complete", taskResp.Task.Status) fmt.Printf("Compute Task %d completed successfully\n", ctb.taskID) } + + response, err := ctb.masterHttp.GetTaskStartSequence(taskIds) + require.NoError(ctb.t, err) + sequence := response.Sequence + for i, id := range sequence { + fmt.Printf("Task %d started at position %d in the sequence\n", id, i+1) + } + require.Equal(ctb.t, len(taskIds), len(sequence)) + + return taskIds, sequence } // SendComputeReqSync diff --git a/vermeer/test/functional/compute_task.go b/vermeer/test/functional/compute_task.go index 96939e134..3ab529666 100644 --- a/vermeer/test/functional/compute_task.go +++ b/vermeer/test/functional/compute_task.go @@ -36,7 +36,7 @@ type ComputeTask interface { masterHttp *client.VermeerClient, t *testing.T, healthCheck *HealthCheck) TaskComputeBody() map[string]string SendComputeReqAsync(params map[string]string) - SendComputeReqAsyncBatchPriority(num int, params map[string]string) + SendComputeReqAsyncBatchPriority(params []map[string]string) ([]int32, []int32) 
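	// The batch helper above is expected to be driven roughly as follows (an illustrative sketch,
	// not code from this patch; masterHttp, graph, p0/p1 and ids stand in for the test's own values,
	// while CreateTaskBatch and GetTaskStartSequence are the client methods introduced here):
	//
	//	req := client.TaskCreateBatchRequest{
	//		{TaskType: "compute", GraphName: graph, Params: p0},
	//		{TaskType: "compute", GraphName: graph, Params: p1},
	//	}
	//	resp, err := masterHttp.CreateTaskBatch(req)      // one POST to /tasks/create/batch queues both
	//	// ... poll each task in *resp until its status is "complete" ...
	//	seq, err := masterHttp.GetTaskStartSequence(ids)  // actual start order, for ordering assertions
	//
	// Callers compare the returned sequence against the submitted task IDs to check that priority,
	// size or dependency rules were honored.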
SendComputeReqSync(params map[string]string) LoadComputeRes() ([]interface{}, error) CheckRes() diff --git a/vermeer/test/functional/load_local.go b/vermeer/test/functional/load_local.go index 8da57f048..bf217cf07 100644 --- a/vermeer/test/functional/load_local.go +++ b/vermeer/test/functional/load_local.go @@ -21,6 +21,9 @@ package functional import ( "math/rand" + "strconv" + + "github.com/sirupsen/logrus" ) type LoadTaskLocal struct { @@ -43,3 +46,26 @@ func (lt *LoadTaskLocal) TaskLoadBody() map[string]string { "load.vertex_backend": vertexBackends[rand.Intn(len(vertexBackends))], } } + +func (lt *LoadTaskLocal) TaskLoadBodyWithNum(num int) map[string]string { + vertexBackends := []string{"db", "mem"} + + if num <= 10 { + num = 30 + } + + logrus.Infof("load with num: " + strconv.Itoa(num-1)) + + return map[string]string{ + "load.parallel": "100", + "load.type": "local", + "load.use_property": "0", + //"load.use_outedge": "1", + //"load.use_out_degree": "1", + //"load.use_undirected": "0", + "load.delimiter": " ", + "load.vertex_files": "{\"127.0.0.1\":\"" + "test/case/vertex/vertex_[0," + strconv.Itoa(num-1) + "]" + "\"}", + "load.edge_files": "{\"127.0.0.1\":\"" + "test/case/edge/edge_[0," + strconv.Itoa(num-1) + "]" + "\"}", + "load.vertex_backend": vertexBackends[rand.Intn(len(vertexBackends))], + } +} diff --git a/vermeer/test/scheduler/batch.go b/vermeer/test/scheduler/batch.go new file mode 100644 index 000000000..7afaad7a8 --- /dev/null +++ b/vermeer/test/scheduler/batch.go @@ -0,0 +1,14 @@ +package scheduler + +import ( + "testing" + "vermeer/client" + "vermeer/test/functional" +) + +func TestBatch(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { + // TEST GROUP: BATCH + // 1. 
send batch tasks to single graph + // expect: the tasks should be executed in order of time + // have been tested in priority.go +} diff --git a/vermeer/test/scheduler/priority.go b/vermeer/test/scheduler/priority.go new file mode 100644 index 000000000..6d2c5c799 --- /dev/null +++ b/vermeer/test/scheduler/priority.go @@ -0,0 +1,108 @@ +package scheduler + +import ( + "fmt" + "testing" + "time" + + "vermeer/client" + "vermeer/test/functional" + + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" +) + +func SubTestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { + fmt.Printf("Test Priority start with task: %s\n", computeTask) + bTime := time.Now() + computeTest, err := functional.MakeComputeTask(computeTask) + require.NoError(t, err) + computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) + taskComputeBody := computeTest.TaskComputeBody() + + // send two tasks with different priority + params := make([]map[string]string, 0) + + for i := 0; i < 2; i++ { + param := make(map[string]string) + param["priority"] = fmt.Sprintf("%d", i) + for k, v := range taskComputeBody { + param[k] = v + } + params = append(params, param) + } + + logrus.Infof("params for priority test: %+v", params) + + taskids, sequence := computeTest.SendComputeReqAsyncBatchPriority(params) // send multiple requests asynchronously with priority + + require.Equal(t, 2, len(sequence)) + for i := 0; i < 2; i++ { + require.Equal(t, taskids[1-i], sequence[i]) // expect task with priority 1 executed before priority 0 + } + + computeTest.CheckRes() + fmt.Printf("Test Priority: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) +} + +func SubTestSmall(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { + fmt.Printf("Test Small start with task: %s\n", computeTask) + bTime := time.Now() + computeTest, err := functional.MakeComputeTask(computeTask) + computeTaskSmall, err := functional.MakeComputeTask(computeTask) + require.NoError(t, err) + computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) + taskComputeBody := computeTest.TaskComputeBody() + computeTaskSmall.Init(graphName[1], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) + taskComputeBodySmall := computeTaskSmall.TaskComputeBody() + + // send two tasks with different size + params := make([]map[string]string, 0) + taskComputeBody["graph_name"] = graphName[0] + taskComputeBodySmall["graph_name"] = graphName[1] + params = append(params, taskComputeBody) + params = append(params, taskComputeBodySmall) + + logrus.Infof("params for small test: %+v", params) + + taskids, sequence := computeTest.SendComputeReqAsyncBatchPriority(params) // send multiple requests asynchronously with priority + + require.Equal(t, 2, len(sequence)) + for i := 0; i < 2; i++ { + require.Equal(t, taskids[1-i], sequence[i]) // expect task smaller executed before larger + } + + computeTest.CheckRes() + fmt.Printf("Test Small: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) +} + +func TestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { + fmt.Print("start test priority\n") + + // for scheduler, just 
test a simple task + var computeTask = "pagerank" + + // TEST GROUP: PRIORITY + // 1. send priority tasks to single graph + // expect: the tasks should be executed in order of priority + + SubTestPriority(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + + // 2. send small tasks and large tasks to single graph + // expect: the small tasks should be executed first + + SubTestSmall(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + + // 3. send support concurrent tasks to single graph + // expect: the tasks should be executed concurrently + + // 4. send dependency-tasks to single graph + // expect: the tasks should be executed in order of dependency + + // 5. send same priority tasks to single graph + // expect: the tasks should be executed in order of time + + // 6. send tasks to different graphs + // expect: the tasks should be executed concurrently + // have been tested in SubTestSmall +} diff --git a/vermeer/test/scheduler/routine.go b/vermeer/test/scheduler/routine.go new file mode 100644 index 000000000..59d403d3e --- /dev/null +++ b/vermeer/test/scheduler/routine.go @@ -0,0 +1,13 @@ +package scheduler + +import ( + "testing" + "vermeer/client" + "vermeer/test/functional" +) + +func TestRoutine(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { + // TEST GROUP: ROUTINE + // 1. send tasks to single graph + // expect: the tasks should be executed timely +} diff --git a/vermeer/test/scheduler/test_scheduler.go b/vermeer/test/scheduler/test_scheduler.go new file mode 100644 index 000000000..84c2abf37 --- /dev/null +++ b/vermeer/test/scheduler/test_scheduler.go @@ -0,0 +1,60 @@ +package scheduler + +import ( + "fmt" + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "vermeer/client" + "vermeer/test/functional" +) + +func TestScheduler(t *testing.T, expectResPath string, masterHttpAddr string, graphName string, factor string, waitSecond int) { + fmt.Print("start test scheduler\n") + + startTime := time.Now() + expectRes, err := functional.GetExpectRes(expectResPath) + require.NoError(t, err) + + masterHttp := client.VermeerClient{} + masterHttp.Init("http://"+masterHttpAddr, http.DefaultClient) + + // health check + healthCheck := functional.HealthCheck{} + healthCheck.Init(t, &masterHttp) + healthCheck.DoHealthCheck() + + // load graph first + loadTest1 := functional.LoadTaskLocal{} + loadTest1.Init(graphName+"_1", expectRes, &masterHttp, waitSecond, t, &healthCheck) + loadTest1.SendLoadRequest(loadTest1.TaskLoadBodyWithNum(0)) + loadTest1.CheckGraph() + + loadTest2 := functional.LoadTaskLocal{} + loadTest2.Init(graphName+"_2", expectRes, &masterHttp, waitSecond, t, &healthCheck) + loadTest2.SendLoadRequest(loadTest2.TaskLoadBodyWithNum(20)) + // loadTest2.CheckGraph() + + TestPriority(t, expectRes, &healthCheck, &masterHttp, []string{graphName + "_1", graphName + "_2"}, factor, waitSecond) + + TestBatch(t, expectRes, &healthCheck, &masterHttp, []string{graphName + "_1"}, factor, waitSecond) + + TestRoutine(t, expectRes, &healthCheck, &masterHttp, []string{graphName + "_2"}, factor, waitSecond) + + // Error handling: cancel task + cancelTask := functional.CancelTask{} + cancelTask.CancelTask(t, &masterHttp, graphName+"_1") + cancelTask.CancelTask(t, &masterHttp, graphName+"_2") + fmt.Print("test cancel task [OK]\n") + + // Finally, delete graph + deleteGraph := functional.DeleteGraph{} + 
deleteGraph.DeleteGraph(t, &masterHttp, graphName+"_1") + deleteGraph.DeleteGraph(t, &masterHttp, graphName+"_2") + fmt.Print("test delete graph [OK]\n") + + fmt.Printf("client test finished, cost time:%v\n", time.Since(startTime)) +} diff --git a/vermeer/vermeer_test.go b/vermeer/vermeer_test.go index 86c2ecfde..0be370c07 100644 --- a/vermeer/vermeer_test.go +++ b/vermeer/vermeer_test.go @@ -31,6 +31,7 @@ import ( "vermeer/client" "vermeer/test/functional" + "vermeer/test/scheduler" ) var ( @@ -95,6 +96,8 @@ func TestVermeer(t *testing.T) { t.Run("algorithms", testAlgorithms) case "function": t.Run("function", testFunction) + case "scheduler": + t.Run("scheduler", testScheduler) } } @@ -102,6 +105,10 @@ func testFunction(t *testing.T) { functional.TestFunction(t, expectResPath, masterHttpAddr, graphName, factor, waitSecond) } +func testScheduler(t *testing.T) { + scheduler.TestScheduler(t, expectResPath, masterHttpAddr, graphName, factor, waitSecond) +} + func testAlgorithms(t *testing.T) { // todo: 增加算法名称 // var computeTasks = []string{"pagerank", "lpa", "wcc", "degree_out", "degree_in", "degree_both", "triangle_count", @@ -158,8 +165,8 @@ func testAlgorithms(t *testing.T) { taskComputeBody := computeTest.TaskComputeBody() taskComputeBody["output.need_query"] = needQuery if sendType == "async" { - // computeTest.SendComputeReqAsync(taskComputeBody) - computeTest.SendComputeReqAsyncBatchPriority(10, taskComputeBody) // 异步发送多个请求 + computeTest.SendComputeReqAsync(taskComputeBody) + // computeTest.SendComputeReqAsyncBatchPriority(10, taskComputeBody) // 异步发送多个请求 } else { computeTest.SendComputeReqSync(taskComputeBody) } From aca299475efc1a49cfce97dac65461eb96dd0ba1 Mon Sep 17 00:00:00 2001 From: ethereal Date: Thu, 11 Sep 2025 13:40:31 +0800 Subject: [PATCH 18/27] chore: add some test for priority --- vermeer/apps/master/bl/task_bl.go | 3 + vermeer/apps/master/master_main.go | 1 + .../schedules/scheduler_algorithm_manager.go | 8 +- .../schedules/scheduler_resource_manager.go | 5 +- vermeer/client/client.go | 20 ++++ vermeer/config/master.ini | 1 + vermeer/config/worker04.ini | 23 ++++ vermeer/test/functional/compute_base.go | 11 ++ vermeer/test/functional/compute_task.go | 1 + vermeer/test/scheduler/priority.go | 106 +++++++++++++++--- vermeer/test/scheduler/routine.go | 27 +++++ 11 files changed, 182 insertions(+), 24 deletions(-) create mode 100644 vermeer/config/worker04.ini diff --git a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go index 9070a6f29..f10417926 100644 --- a/vermeer/apps/master/bl/task_bl.go +++ b/vermeer/apps/master/bl/task_bl.go @@ -93,6 +93,9 @@ func (tb *TaskBl) CreateTaskInfo( logrus.Warnf("exclusive convert to bool error:%v", err) } } + if cronExpr, ok := params["cron_expr"]; ok { + taskInfo.CronExpr = cronExpr + } } return taskInfo, nil diff --git a/vermeer/apps/master/master_main.go b/vermeer/apps/master/master_main.go index 2da20a2fd..91cd33180 100644 --- a/vermeer/apps/master/master_main.go +++ b/vermeer/apps/master/master_main.go @@ -56,6 +56,7 @@ func Main() { services.SetUI(sen) logrus.Info("token-auth was activated") default: + services.SetAdminRouters(sen, auth.NoneAuthFilter) services.SetRouters(sen, auth.NoneAuthFilter) logrus.Warn("No authentication was activated.") } diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index 8a4333140..aa0189bf6 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ 
b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -32,8 +32,8 @@ func (am *SchedulerAlgorithmManager) Init() { am.schuduledSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) am.dispatchPaused = false // Register filter and schedule algorithms - am.RegisterFilterAlgorithm(&WaitingSchedulerAlgorithm{}) am.RegisterFilterAlgorithm(&DependsSchedulerAlgorithm{}) + am.RegisterFilterAlgorithm(&WaitingSchedulerAlgorithm{}) // Register default SchedulerAlgorithms am.RegisterSchedulerAlgorithm(&PriorityElderSchedulerAlgorithm{}) } @@ -383,9 +383,9 @@ func (d *DependsSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskIn return allTasks[i].ID < allTasks[j].ID }) - waitingTaskIDs := make(map[int32]*structure.TaskInfo) + taskIDs := make(map[int32]*structure.TaskInfo) for _, task := range allTasks { - waitingTaskIDs[task.ID] = task + taskIDs[task.ID] = task } filteredTasks := make([]*structure.TaskInfo, 0) @@ -394,7 +394,7 @@ func (d *DependsSchedulerAlgorithm) FilterNextTasks(allTasks []*structure.TaskIn // Check if all dependencies are satisfied allDepsSatisfied := true for _, dep := range depends { - if depTask, exists := waitingTaskIDs[dep]; !exists || depTask.State != structure.TaskStateWaiting { + if depTask, exists := taskIDs[dep]; exists && depTask.State != structure.TaskStateComplete { allDepsSatisfied = false break } diff --git a/vermeer/apps/master/schedules/scheduler_resource_manager.go b/vermeer/apps/master/schedules/scheduler_resource_manager.go index 37cc2e9a9..35f03a342 100644 --- a/vermeer/apps/master/schedules/scheduler_resource_manager.go +++ b/vermeer/apps/master/schedules/scheduler_resource_manager.go @@ -82,7 +82,7 @@ func (rm *SchedulerResourceManager) GetAgentAndAssignTask(taskInfo *structure.Ta defer rm.Unlock(rm.Lock()) - agent, status, workers, err := rm.broker.ApplyAgent(taskInfo) + agent, status, workers, err := rm.broker.ApplyAgent(taskInfo, !taskInfo.Exclusive) if err != nil { return nil, AgentStatusError, err } @@ -93,8 +93,9 @@ func (rm *SchedulerResourceManager) GetAgentAndAssignTask(taskInfo *structure.Ta // Assign the task to the agent agent.AssignTask(taskInfo) + exclusive := taskInfo.Exclusive runningStatus := WorkerOngoingStatusRunning - if _, exists := rm.runningWorkerGroupTasks[agent.GroupName()]; !exists { + if _, exists := rm.runningWorkerGroupTasks[agent.GroupName()]; !exists && exclusive { rm.runningWorkerGroupTasks[agent.GroupName()] = []int32{} runningStatus = WorkerOngoingStatusRunning rm.workerGroupStatus[agent.GroupName()] = runningStatus diff --git a/vermeer/client/client.go b/vermeer/client/client.go index 04c50a237..34553aa3e 100644 --- a/vermeer/client/client.go +++ b/vermeer/client/client.go @@ -150,6 +150,26 @@ func (vc *VermeerClient) GetWorkers() (*WorkersResponse, error) { return workersResp, err } +func (vc *VermeerClient) AllocGroupGraph(graphName string, groupName string) (bool, error) { + reader, err := Request2Reader(struct{}{}) + if err != nil { + return false, err + } + resp, err := vc.post(vc.httpAddr+"/admin/workers/alloc/"+groupName+"/$DEFAULT/"+graphName, reader) + if err != nil { + return false, err + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + respByte, err := ParseResponse2Byte(resp) + if err != nil { + return false, err + } + return false, fmt.Errorf("response:%s", string(respByte)) + } + return true, nil +} + func (vc *VermeerClient) GetMaster() (*MasterResponse, error) { resp, err := vc.get(vc.httpAddr + "/master") if err != nil { diff --git a/vermeer/config/master.ini 
b/vermeer/config/master.ini index 11827608a..72695d6ed 100644 --- a/vermeer/config/master.ini +++ b/vermeer/config/master.ini @@ -25,3 +25,4 @@ task_parallel_num=1 auth=none auth_token_factor=1234 start_chan_size=10 +ticker_interval=1 diff --git a/vermeer/config/worker04.ini b/vermeer/config/worker04.ini new file mode 100644 index 000000000..8b341c8f4 --- /dev/null +++ b/vermeer/config/worker04.ini @@ -0,0 +1,23 @@ +; Licensed to the Apache Software Foundation (ASF) under one or more +; contributor license agreements. See the NOTICE file distributed with +; this work for additional information regarding copyright ownership. +; The ASF licenses this file to You under the Apache License, Version 2.0 +; (the "License"); you may not use this file except in compliance with +; the License. You may obtain a copy of the License at +; +; http://www.apache.org/licenses/LICENSE-2.0 +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, +; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +; See the License for the specific language governing permissions and +; limitations under the License. + +[default] +log_level=info +debug_mode=release +http_peer=0.0.0.0:6988 +grpc_peer=0.0.0.0:6989 +master_peer=127.0.0.1:6689 +run_mode=worker +worker_group=test \ No newline at end of file diff --git a/vermeer/test/functional/compute_base.go b/vermeer/test/functional/compute_base.go index 5bdb1d64c..e745aa65e 100644 --- a/vermeer/test/functional/compute_base.go +++ b/vermeer/test/functional/compute_base.go @@ -97,6 +97,17 @@ func (ctb *ComputeTaskBase) SendComputeReqAsync(params map[string]string) { require.Equal(ctb.t, "complete", taskResp.Task.Status) } +func (ctb *ComputeTaskBase) SendComputeReqAsyncNotWait(params map[string]string) int32 { + //create Compute Task + resp, err := ctb.masterHttp.CreateTaskAsync(client.TaskCreateRequest{ + TaskType: "compute", + GraphName: ctb.graphName, + Params: params, + }) + require.NoError(ctb.t, err) + return int32(resp.Task.ID) +} + func (ctb *ComputeTaskBase) SendComputeReqAsyncBatchPriority(params []map[string]string) ([]int32, []int32) { //create Compute Task tasks := make([]client.TaskInfo, 0, len(params)) diff --git a/vermeer/test/functional/compute_task.go b/vermeer/test/functional/compute_task.go index 3ab529666..9373fe153 100644 --- a/vermeer/test/functional/compute_task.go +++ b/vermeer/test/functional/compute_task.go @@ -36,6 +36,7 @@ type ComputeTask interface { masterHttp *client.VermeerClient, t *testing.T, healthCheck *HealthCheck) TaskComputeBody() map[string]string SendComputeReqAsync(params map[string]string) + SendComputeReqAsyncNotWait(params map[string]string) int32 SendComputeReqAsyncBatchPriority(params []map[string]string) ([]int32, []int32) SendComputeReqSync(params map[string]string) LoadComputeRes() ([]interface{}, error) diff --git a/vermeer/test/scheduler/priority.go b/vermeer/test/scheduler/priority.go index 6d2c5c799..9c4c01de0 100644 --- a/vermeer/test/scheduler/priority.go +++ b/vermeer/test/scheduler/priority.go @@ -76,33 +76,103 @@ func SubTestSmall(t *testing.T, expectRes *functional.ExpectRes, healthCheck *fu fmt.Printf("Test Small: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) } +func SubTestConcurrent(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { + fmt.Printf("Test Concurrent start with 
task: %s\n", computeTask) + bTime := time.Now() + computeTest, err := functional.MakeComputeTask(computeTask) + require.NoError(t, err) + computeTest.Init(graphName[1], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) + taskComputeBody := computeTest.TaskComputeBody() + + // send two tasks with different size + params := make([]map[string]string, 0) + taskComputeBody["exclusive"] = "false" + params = append(params, taskComputeBody) + params = append(params, taskComputeBody) + + logrus.Infof("params for concurrent test: %+v", params) + + _, sequence := computeTest.SendComputeReqAsyncBatchPriority(params) // send multiple requests asynchronously with priority + + require.Equal(t, 2, len(sequence)) + + fmt.Printf("Test Concurrent: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) + // cost should be less than 2 * single task time +} + +func SubTestDepends(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { + fmt.Printf("Test Depends start with task: %s\n", computeTask) + bTime := time.Now() + computeTest, err := functional.MakeComputeTask(computeTask) + require.NoError(t, err) + computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) + taskComputeBody := computeTest.TaskComputeBody() + + // first alloc worker 4 for graph 3 + masterHttp.AllocGroupGraph(graphName[0]+"_3", "test") + + loadTest3 := functional.LoadTaskLocal{} + loadTest3.Init(graphName[0]+"_3", expectRes, masterHttp, waitSecond, t, healthCheck) + loadTest3.SendLoadRequest(loadTest3.TaskLoadBodyWithNum(10)) + + // send a large task to $ worker group + taskid := computeTest.SendComputeReqAsyncNotWait(taskComputeBody) + + // send two tasks with different dependency to the same graph + taskComputeBody["graph_name"] = graphName[0] + "_3" + params := make([]map[string]string, 0) + new_body := make(map[string]string) + for k, v := range taskComputeBody { + new_body[k] = v + } + new_body["preorders"] = fmt.Sprintf("%d", taskid) + params = append(params, new_body) + params = append(params, taskComputeBody) + + logrus.Infof("params for depends test: %+v", params) + + taskids, sequence := computeTest.SendComputeReqAsyncBatchPriority(params) // send multiple requests asynchronously with priority + + require.Equal(t, 2, len(sequence)) + for i := 0; i < 2; i++ { + require.Equal(t, taskids[1-i], sequence[i]) // expect task not depend executed first + } + + // computeTest.CheckRes() + fmt.Printf("Test Depends: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) +} + func TestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { fmt.Print("start test priority\n") - // for scheduler, just test a simple task - var computeTask = "pagerank" + // // for scheduler, just test a simple task + // var computeTask = "pagerank" + + // // TEST GROUP: PRIORITY + // // 1. send priority tasks to single graph + // // expect: the tasks should be executed in order of priority - // TEST GROUP: PRIORITY - // 1. send priority tasks to single graph - // expect: the tasks should be executed in order of priority + // SubTestPriority(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) - SubTestPriority(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + // // 2. 
send small tasks and large tasks to single graph + // // expect: the small tasks should be executed first - // 2. send small tasks and large tasks to single graph - // expect: the small tasks should be executed first + // SubTestSmall(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) - SubTestSmall(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + // // 3. send support concurrent tasks to single graph + // // expect: the tasks should be executed concurrently + // SubTestConcurrent(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) - // 3. send support concurrent tasks to single graph - // expect: the tasks should be executed concurrently + // // 4. send dependency-tasks to single graph + // // expect: the tasks should be executed in order of dependency - // 4. send dependency-tasks to single graph - // expect: the tasks should be executed in order of dependency + // SubTestDepends(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) - // 5. send same priority tasks to single graph - // expect: the tasks should be executed in order of time + // // 5. send same priority tasks to single graph + // // expect: the tasks should be executed in order of time + // // skipped, too fragile - // 6. send tasks to different graphs - // expect: the tasks should be executed concurrently - // have been tested in SubTestSmall + // // 6. send tasks to different graphs + // // expect: the tasks should be executed concurrently + // // have been tested in SubTestSmall and SubTestDepends } diff --git a/vermeer/test/scheduler/routine.go b/vermeer/test/scheduler/routine.go index 59d403d3e..c61bb425f 100644 --- a/vermeer/test/scheduler/routine.go +++ b/vermeer/test/scheduler/routine.go @@ -1,13 +1,40 @@ package scheduler import ( + "fmt" "testing" + "time" "vermeer/client" "vermeer/test/functional" + + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/require" ) +func SubTestRoutine(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { + fmt.Printf("Test Routine start with task: %s\n", computeTask) + bTime := time.Now() + computeTest, err := functional.MakeComputeTask(computeTask) + require.NoError(t, err) + computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) + taskComputeBody := computeTest.TaskComputeBody() + + taskComputeBody["cron_expr"] = "*/1 * * * * *" // every second + + logrus.Infof("params for routine test: %+v", taskComputeBody) + + computeTest.SendComputeReqAsync(taskComputeBody) + computeTest.CheckRes() + + // wait for a while and check again + time.Sleep(10 * time.Second) + fmt.Printf("Test Routine: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) +} + func TestRoutine(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { // TEST GROUP: ROUTINE // 1. 
send tasks to single graph // expect: the tasks should be executed timely + + SubTestRoutine(t, expectRes, healthCheck, masterHttp, graphName, factor, waitSecond) } From 17b98ef7275932a79a5cd25182f57877ea5a5eed Mon Sep 17 00:00:00 2001 From: ethereal Date: Sat, 13 Sep 2025 22:08:42 +0800 Subject: [PATCH 19/27] chore: add some test for routine --- vermeer/apps/master/bl/scheduler_bl.go | 15 +++++- vermeer/apps/master/bl/task_bl.go | 3 ++ .../schedules/scheduler_cron_manager.go | 50 ++++++++++++++++++- vermeer/test/scheduler/priority.go | 1 + vermeer/test/scheduler/routine.go | 16 ++++-- 5 files changed, 77 insertions(+), 8 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index 146f24364..d8e0712bf 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -244,10 +244,11 @@ func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) erro s.taskManager.RemoveTask(taskId) // release the worker group s.resourceManager.ReleaseByTaskID(taskId) - // stop the cron job if exists - s.cronManager.DeleteTask(taskId) if len(removeWorkerName) > 0 { + // stop the cron job if exists + s.cronManager.DeleteTask(taskId) + // remove the worker from resource manager workerName := removeWorkerName[0] if workerName == "" { return errors.New("the argument `removeWorkerName` is empty") @@ -408,6 +409,16 @@ func (s *ScheduleBl) handleCancelTask(taskInfo *structure.TaskInfo) error { return nil } +func (s *ScheduleBl) CancelCronTask(taskInfo *structure.TaskInfo) error { + if taskInfo == nil { + return errors.New("the argument `taskInfo` is nil") + } + + s.cronManager.DeleteTask(taskInfo.ID) + + return nil +} + // ** Other Methods ** func (s *ScheduleBl) PeekSpaceTail(space string) *structure.TaskInfo { diff --git a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go index f10417926..63bd25f03 100644 --- a/vermeer/apps/master/bl/task_bl.go +++ b/vermeer/apps/master/bl/task_bl.go @@ -182,6 +182,9 @@ func (tb *TaskBl) CancelTask(taskID int32) error { return fmt.Errorf("cannot cancel the task with id '%v' as it was not created by you", taskID) } + // stop the cron job if exists + Scheduler.CancelCronTask(task) + if task.State == structure.TaskStateCanceled { return fmt.Errorf("task had been in state canceled") } diff --git a/vermeer/apps/master/schedules/scheduler_cron_manager.go b/vermeer/apps/master/schedules/scheduler_cron_manager.go index 6e3b633fd..c186356c1 100644 --- a/vermeer/apps/master/schedules/scheduler_cron_manager.go +++ b/vermeer/apps/master/schedules/scheduler_cron_manager.go @@ -27,6 +27,7 @@ func (t *SchedulerCronManager) CheckCronExpression(cronExpr string) error { return errors.New("cron expression is empty") } if _, err := cron.ParseStandard(cronExpr); err != nil { + logrus.Errorf("Failed to parse cron expression: %v", err) return errors.New("invalid cron expression: " + err.Error()) } return nil @@ -41,22 +42,44 @@ func (t *SchedulerCronManager) AddCronTask(taskInfo *structure.TaskInfo) error { return errors.New("the property `CronExpr` of taskInfo is empty") } + // add to cron tasks t.cronTasks[taskInfo.ID] = append(t.cronTasks[taskInfo.ID], taskInfo) cronJob := cron.New() _, err := cronJob.AddFunc(taskInfo.CronExpr, func() { if taskInfo == nil { return } - if _, err := t.queueHandler(taskInfo); err != nil { + + // TODO: CREATE a new task from the original task, using taskbl + // copy a new taskInfo + task, err := structure.TaskManager.CreateTask(taskInfo.SpaceName, 
taskInfo.Type, 0) + task.CreateType = structure.TaskCreateAsync + task.GraphName = taskInfo.GraphName + task.CreateUser = taskInfo.CreateUser + task.Params = taskInfo.Params + task.CronExpr = "" // clear cron expression for the new task + task.Priority = taskInfo.Priority + task.Preorders = taskInfo.Preorders + task.Exclusive = taskInfo.Exclusive + if err != nil { + logrus.Errorf("Failed to create task from cron job for task %d: %v", taskInfo.ID, err) + return + } + structure.TaskManager.AddTask(task) + structure.TaskManager.SaveTask(task.ID) + if _, err := t.queueHandler(task); err != nil { logrus.Errorf("Failed to queue task %d in cron job: %v", taskInfo.ID, err) return } + logrus.Infof("Successfully queued task %d from cron job", task.ID) }) if err != nil { logrus.Errorf("Failed to add cron job for task %d: %v", taskInfo.ID, err) return err } t.crons[taskInfo.ID] = append(t.crons[taskInfo.ID], cronJob) + cronJob.Start() + logrus.Infof("Added cron task for task ID %d with expression %s", taskInfo.ID, taskInfo.CronExpr) return nil } @@ -73,3 +96,28 @@ func (t *SchedulerCronManager) DeleteTask(taskID int32) error { logrus.Infof("Deleted cron task for task ID %d", taskID) return nil } + +func (t *SchedulerCronManager) DeleteTaskByGraph(spaceName, graphName string) error { + if spaceName == "" || graphName == "" { + return errors.New("the argument `spaceName` or `graphName` is empty") + } + + var toDelete []int32 + for taskID, tasks := range t.cronTasks { + for _, task := range tasks { + if task.SpaceName == spaceName && task.GraphName == graphName { + toDelete = append(toDelete, taskID) + break + } + } + } + + for _, taskID := range toDelete { + if err := t.DeleteTask(taskID); err != nil { + logrus.Errorf("Failed to delete cron task for task ID %d: %v", taskID, err) + return err + } + } + logrus.Infof("Deleted cron tasks for space %s and graph %s", spaceName, graphName) + return nil +} diff --git a/vermeer/test/scheduler/priority.go b/vermeer/test/scheduler/priority.go index 9c4c01de0..61c354fde 100644 --- a/vermeer/test/scheduler/priority.go +++ b/vermeer/test/scheduler/priority.go @@ -86,6 +86,7 @@ func SubTestConcurrent(t *testing.T, expectRes *functional.ExpectRes, healthChec // send two tasks with different size params := make([]map[string]string, 0) + // default is false, actually do not need to set taskComputeBody["exclusive"] = "false" params = append(params, taskComputeBody) params = append(params, taskComputeBody) diff --git a/vermeer/test/scheduler/routine.go b/vermeer/test/scheduler/routine.go index c61bb425f..9449660f4 100644 --- a/vermeer/test/scheduler/routine.go +++ b/vermeer/test/scheduler/routine.go @@ -19,22 +19,28 @@ func SubTestRoutine(t *testing.T, expectRes *functional.ExpectRes, healthCheck * computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) taskComputeBody := computeTest.TaskComputeBody() - taskComputeBody["cron_expr"] = "*/1 * * * * *" // every second + // every 1 minute + taskComputeBody["cron_expr"] = "* * * * *" logrus.Infof("params for routine test: %+v", taskComputeBody) - computeTest.SendComputeReqAsync(taskComputeBody) - computeTest.CheckRes() + taskid := computeTest.SendComputeReqAsyncNotWait(taskComputeBody) + // computeTest.CheckRes() // wait for a while and check again - time.Sleep(10 * time.Second) + time.Sleep(2 * time.Minute) + + masterHttp.GetTaskCancel(int(taskid)) + fmt.Printf("Test Routine: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) } func TestRoutine(t *testing.T, expectRes 
*functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { + var computeTask = "pagerank" + // TEST GROUP: ROUTINE // 1. send tasks to single graph // expect: the tasks should be executed timely - SubTestRoutine(t, expectRes, healthCheck, masterHttp, graphName, factor, waitSecond) + SubTestRoutine(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) } From ec5c1d22cc6cddbbd470c0025d098b52ebc8759b Mon Sep 17 00:00:00 2001 From: ethereal Date: Mon, 15 Sep 2025 01:19:06 +0800 Subject: [PATCH 20/27] chore: add some test for routine --- vermeer/apps/master/bl/scheduler_bl.go | 32 ++++++++++++-- vermeer/apps/master/bl/task_bl.go | 1 + vermeer/apps/master/bl/task_creator.go | 22 ++++++++++ .../schedules/scheduler_cron_manager.go | 28 ++++--------- vermeer/test/scheduler/priority.go | 42 +++++++++---------- vermeer/test/scheduler/routine.go | 8 ++++ 6 files changed, 87 insertions(+), 46 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index d8e0712bf..d5db54a1b 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -59,7 +59,7 @@ func (s *ScheduleBl) Init() { s.algorithmManager = &schedules.SchedulerAlgorithmManager{} s.algorithmManager.Init() s.cronManager = &schedules.SchedulerCronManager{} - s.cronManager.Init(s.QueueTask) + s.cronManager.Init(s.QueueTaskFromTemplate) go s.startTicker() go s.waitingStartedTask() } @@ -186,7 +186,7 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { return false, errors.New("the property `SpaceName` of taskInfo is empty") } - //defer s.Unlock(s.Lock()) + defer s.Unlock(s.Lock()) if err := taskMgr.SetState(taskInfo, structure.TaskStateWaiting); err != nil { return false, err } @@ -212,6 +212,30 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { return ok, nil } +func (s *ScheduleBl) QueueTaskFromTemplate(template *structure.TaskInfo) (int32, error) { + if template == nil { + return -1, errors.New("the argument `template` is nil") + } + + bc := &baseCreator{} + taskInfo, err := bc.CopyTaskInfo(template) + if err != nil { + logrus.Errorf("failed to copy task info from template, template ID: %d, caused by: %v", template.ID, err) + return -1, err + } + bc.saveTaskInfo(taskInfo) + + ok, err := s.QueueTask(taskInfo) + if err != nil || !ok { + logrus.Errorf("failed to queue task from template, template ID: %d, caused by: %v", template.ID, err) + return -1, err + } + + logrus.Infof("queued task '%d' from template '%d'", taskInfo.ID, template.ID) + + return taskInfo.ID, nil +} + func (s *ScheduleBl) BatchQueueTask(taskInfos []*structure.TaskInfo) ([]bool, []error) { if len(taskInfos) == 0 { return []bool{}, []error{} @@ -220,7 +244,7 @@ func (s *ScheduleBl) BatchQueueTask(taskInfos []*structure.TaskInfo) ([]bool, [] s.PauseDispatch() defer s.ResumeDispatch() - defer s.Unlock(s.Lock()) + // defer s.Unlock(s.Lock()) errors := make([]error, len(taskInfos)) oks := make([]bool, len(taskInfos)) @@ -246,7 +270,7 @@ func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) erro s.resourceManager.ReleaseByTaskID(taskId) if len(removeWorkerName) > 0 { - // stop the cron job if exists + // stop the cron job if exists when need remove worker, otherwise the task is just closed normally s.cronManager.DeleteTask(taskId) // remove the worker from resource manager workerName := removeWorkerName[0] diff 
--git a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go index 63bd25f03..75da02dbd 100644 --- a/vermeer/apps/master/bl/task_bl.go +++ b/vermeer/apps/master/bl/task_bl.go @@ -68,6 +68,7 @@ func (tb *TaskBl) CreateTaskInfo( taskInfo.Priority = 0 taskInfo.Preorders = make([]int32, 0) taskInfo.Exclusive = false // default to false, can be set to true if needed + // taskInfo.Exclusive = true if params != nil { if priority, ok := params["priority"]; ok { if p, err := strconv.Atoi(priority); err == nil { diff --git a/vermeer/apps/master/bl/task_creator.go b/vermeer/apps/master/bl/task_creator.go index ca3f946cd..a7353c85b 100644 --- a/vermeer/apps/master/bl/task_creator.go +++ b/vermeer/apps/master/bl/task_creator.go @@ -99,6 +99,28 @@ func (bc *baseCreator) NewTaskInfo(graphName string, params map[string]string, t return task, nil } +func (bc *baseCreator) CopyTaskInfo(src *structure.TaskInfo) (*structure.TaskInfo, error) { + if src == nil { + return nil, fmt.Errorf("the argument `src` should not be nil") + } + + task, err := taskMgr.CreateTask(src.SpaceName, src.Type, 0) + if err != nil { + return nil, err + } + + task.CreateType = structure.TaskCreateAsync + task.GraphName = src.GraphName + task.CreateUser = src.CreateUser + task.Params = src.Params + task.CronExpr = "" // clear cron expression for the new task + task.Priority = src.Priority + task.Preorders = src.Preorders + task.Exclusive = src.Exclusive + + return task, nil +} + func (bc *baseCreator) saveTaskInfo(task *structure.TaskInfo) (*structure.TaskInfo, error) { if _, err := taskMgr.AddTask(task); err != nil { logrus.Errorf("failed to add a task to `TaskManager`, task: %v, cased by: %v", task, err) diff --git a/vermeer/apps/master/schedules/scheduler_cron_manager.go b/vermeer/apps/master/schedules/scheduler_cron_manager.go index c186356c1..f33ccb10e 100644 --- a/vermeer/apps/master/schedules/scheduler_cron_manager.go +++ b/vermeer/apps/master/schedules/scheduler_cron_manager.go @@ -11,14 +11,14 @@ import ( type SchedulerCronManager struct { cronTasks map[int32][]*structure.TaskInfo // cron expression to TaskInfo. 
Origin task ID to copied tasks crons map[int32][]*cron.Cron // cron expression to cron jobs - // queueHandler is a function that handles the task queue - queueHandler func(*structure.TaskInfo) (bool, error) + // queueTemplateHandler is a function that handles the task queue + queueTemplateHandler func(*structure.TaskInfo) (int32, error) } -func (t *SchedulerCronManager) Init(queueHandler func(*structure.TaskInfo) (bool, error)) *SchedulerCronManager { +func (t *SchedulerCronManager) Init(queueTemplateHandler func(*structure.TaskInfo) (int32, error)) *SchedulerCronManager { t.cronTasks = make(map[int32][]*structure.TaskInfo) t.crons = make(map[int32][]*cron.Cron) - t.queueHandler = queueHandler + t.queueTemplateHandler = queueTemplateHandler return t } @@ -50,28 +50,14 @@ func (t *SchedulerCronManager) AddCronTask(taskInfo *structure.TaskInfo) error { return } - // TODO: CREATE a new task from the original task, using taskbl + // CREATE a new task from the original task, using taskbl, it is handled in queueTemplateHandler // copy a new taskInfo - task, err := structure.TaskManager.CreateTask(taskInfo.SpaceName, taskInfo.Type, 0) - task.CreateType = structure.TaskCreateAsync - task.GraphName = taskInfo.GraphName - task.CreateUser = taskInfo.CreateUser - task.Params = taskInfo.Params - task.CronExpr = "" // clear cron expression for the new task - task.Priority = taskInfo.Priority - task.Preorders = taskInfo.Preorders - task.Exclusive = taskInfo.Exclusive + newID, err := t.queueTemplateHandler(taskInfo) if err != nil { - logrus.Errorf("Failed to create task from cron job for task %d: %v", taskInfo.ID, err) - return - } - structure.TaskManager.AddTask(task) - structure.TaskManager.SaveTask(task.ID) - if _, err := t.queueHandler(task); err != nil { logrus.Errorf("Failed to queue task %d in cron job: %v", taskInfo.ID, err) return } - logrus.Infof("Successfully queued task %d from cron job", task.ID) + logrus.Infof("Successfully queued task %d from cron job", newID) }) if err != nil { logrus.Errorf("Failed to add cron job for task %d: %v", taskInfo.ID, err) diff --git a/vermeer/test/scheduler/priority.go b/vermeer/test/scheduler/priority.go index 61c354fde..439eb6b93 100644 --- a/vermeer/test/scheduler/priority.go +++ b/vermeer/test/scheduler/priority.go @@ -146,34 +146,34 @@ func SubTestDepends(t *testing.T, expectRes *functional.ExpectRes, healthCheck * func TestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { fmt.Print("start test priority\n") - // // for scheduler, just test a simple task - // var computeTask = "pagerank" + // for scheduler, just test a simple task + var computeTask = "pagerank" - // // TEST GROUP: PRIORITY - // // 1. send priority tasks to single graph - // // expect: the tasks should be executed in order of priority + // TEST GROUP: PRIORITY + // 1. send priority tasks to single graph + // expect: the tasks should be executed in order of priority - // SubTestPriority(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + SubTestPriority(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) - // // 2. send small tasks and large tasks to single graph - // // expect: the small tasks should be executed first + // 2. 
send small tasks and large tasks to single graph + // expect: the small tasks should be executed first - // SubTestSmall(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + SubTestSmall(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) - // // 3. send support concurrent tasks to single graph - // // expect: the tasks should be executed concurrently - // SubTestConcurrent(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + // 3. send support concurrent tasks to single graph + // expect: the tasks should be executed concurrently + SubTestConcurrent(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) - // // 4. send dependency-tasks to single graph - // // expect: the tasks should be executed in order of dependency + // 4. send dependency-tasks to single graph + // expect: the tasks should be executed in order of dependency - // SubTestDepends(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + SubTestDepends(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) - // // 5. send same priority tasks to single graph - // // expect: the tasks should be executed in order of time - // // skipped, too fragile + // 5. send same priority tasks to single graph + // expect: the tasks should be executed in order of time + // skipped, too fragile - // // 6. send tasks to different graphs - // // expect: the tasks should be executed concurrently - // // have been tested in SubTestSmall and SubTestDepends + // 6. send tasks to different graphs + // expect: the tasks should be executed concurrently + // have been tested in SubTestSmall and SubTestDepends } diff --git a/vermeer/test/scheduler/routine.go b/vermeer/test/scheduler/routine.go index 9449660f4..5cb2c3b5c 100644 --- a/vermeer/test/scheduler/routine.go +++ b/vermeer/test/scheduler/routine.go @@ -30,6 +30,14 @@ func SubTestRoutine(t *testing.T, expectRes *functional.ExpectRes, healthCheck * // wait for a while and check again time.Sleep(2 * time.Minute) + // check if deployed + queue := []int32{} + queue = append(queue, int32(taskid+1)) + result, err := masterHttp.GetTaskStartSequence(queue) + require.NoError(t, err) + require.Equal(t, 1, len(result.Sequence)) + require.Greater(t, result.Sequence[0], int32(0)) + masterHttp.GetTaskCancel(int(taskid)) fmt.Printf("Test Routine: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) From 02b83ccb53a5587d3f98b2f8ecd03c8d54b9dbef Mon Sep 17 00:00:00 2001 From: ethereal Date: Tue, 16 Sep 2025 00:36:16 +0800 Subject: [PATCH 21/27] chore: change some files/ai suggestions --- vermeer/apps/master/bl/grpc_handlers.go | 6 +- vermeer/apps/master/bl/scheduler_bl.go | 25 +++++--- vermeer/apps/master/bl/task_bl.go | 4 +- .../schedules/scheduler_algorithm_manager.go | 58 +++++++++++-------- .../schedules/scheduler_cron_manager.go | 2 +- .../schedules/scheduler_task_manager.go | 10 ++++ vermeer/config/master.ini | 2 +- vermeer/test/functional/compute_base.go | 2 +- vermeer/test/functional/load_local.go | 2 + vermeer/vermeer_test.go | 6 +- 10 files changed, 76 insertions(+), 41 deletions(-) diff --git a/vermeer/apps/master/bl/grpc_handlers.go b/vermeer/apps/master/bl/grpc_handlers.go index c5ae16987..2734c063f 100644 --- a/vermeer/apps/master/bl/grpc_handlers.go +++ b/vermeer/apps/master/bl/grpc_handlers.go @@ -100,11 +100,15 @@ func (h *ServerHandler) SayHelloMaster(ctx context.Context, req *pb.HelloMasterR } _, err = workerMgr.AddWorker(reqWorker) - 
Scheduler.ChangeWorkerStatus(reqWorker.Name, schedules.WorkerOngoingStatusIdle) if err != nil { logrus.Errorf("failed to add a WorkerClient to the WorkerManager, error: %s", err) return &pb.HelloMasterResp{}, err } + _, err = Scheduler.ChangeWorkerStatus(reqWorker.Name, schedules.WorkerOngoingStatusIdle) + if err != nil { + logrus.Errorf("failed to change worker status to idle, error: %s", err) + return &pb.HelloMasterResp{}, err + } logrus.Infof("worker say hello name: %s and set to workgroup: %s, client: %s", reqWorker.Name, reqWorker.Group, p.Addr.String()) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index d5db54a1b..5582acf62 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -113,22 +113,24 @@ func (s *ScheduleBl) startTicker() { } // this make scheduler manager try to schedule next tasks -func (s *ScheduleBl) TryScheduleNextTasks() { +func (s *ScheduleBl) TryScheduleNextTasks(noLock ...bool) { defer func() { if err := recover(); err != nil { logrus.Errorln("TryScheduleNextTasks() has been recovered:", err) } }() - if err := s.tryScheduleInner(s.softSchedule); err != nil { + if err := s.tryScheduleInner(s.softSchedule, noLock...); err != nil { logrus.Errorf("do scheduling error:%v", err) } } // Main routine to schedule tasks -func (s *ScheduleBl) tryScheduleInner(softSchedule bool) error { +func (s *ScheduleBl) tryScheduleInner(softSchedule bool, noLock ...bool) error { // Implement logic to get the next task in the queue for the given space - defer s.Unlock(s.Lock()) + if !(len(noLock) > 0 && noLock[0]) { + defer s.Unlock(s.Lock()) + } // step 1: make sure all tasks have alloc to a worker group // This is done by the TaskManager, which assigns a worker group to each task @@ -264,6 +266,8 @@ func (s *ScheduleBl) BatchQueueTask(taskInfos []*structure.TaskInfo) ([]bool, [] // ******** CloseCurrent ******** func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) error { + defer s.Unlock(s.Lock()) + // trace tasks need these workers, check if these tasks are available s.taskManager.RemoveTask(taskId) // release the worker group @@ -282,16 +286,19 @@ func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) erro } logrus.Infof("invoke dispatch when task '%d' is closed", taskId) - s.TryScheduleNextTasks() + s.TryScheduleNextTasks(true) return nil } -func (s *ScheduleBl) ChangeWorkerStatus(workerName string, status schedules.WorkerOngoingStatus) { +func (s *ScheduleBl) ChangeWorkerStatus(workerName string, status schedules.WorkerOngoingStatus) (bool, error) { + defer s.Unlock(s.Lock()) s.resourceManager.ChangeWorkerStatus(workerName, status) logrus.Infof("worker '%s' status changed to '%s'", workerName, status) // After changing the worker status, we may need to reschedule tasks - s.TryScheduleNextTasks() + s.TryScheduleNextTasks(true) + + return true, nil } // ******** START TASK ******** @@ -299,7 +306,7 @@ func (s *ScheduleBl) waitingStartedTask() { for taskInfo := range s.startChan { if taskInfo == nil { logrus.Warnf("recieved a nil task from startChan") - return + continue } logrus.Infof("chan received task '%d' to start", taskInfo.ID) @@ -388,6 +395,8 @@ func (s *ScheduleBl) CancelTask(taskInfo *structure.TaskInfo) error { return errors.New("the argument `taskInfo` is nil") } + defer s.Unlock(s.Lock()) + isHeadTask := s.taskManager.IsTaskOngoing(taskInfo.ID) task := s.taskManager.RemoveTask(taskInfo.ID) s.cronManager.DeleteTask(taskInfo.ID) diff --git 
a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go index 75da02dbd..373b0964c 100644 --- a/vermeer/apps/master/bl/task_bl.go +++ b/vermeer/apps/master/bl/task_bl.go @@ -71,7 +71,7 @@ func (tb *TaskBl) CreateTaskInfo( // taskInfo.Exclusive = true if params != nil { if priority, ok := params["priority"]; ok { - if p, err := strconv.Atoi(priority); err == nil { + if p, err := strconv.ParseInt(priority, 10, 32); err == nil { taskInfo.Priority = int32(p) } else { logrus.Warnf("priority convert to int32 error:%v", err) @@ -80,7 +80,7 @@ func (tb *TaskBl) CreateTaskInfo( if preorders, ok := params["preorders"]; ok { preorderList := strings.Split(preorders, ",") for _, preorder := range preorderList { - if pid, err := strconv.Atoi(preorder); err == nil { + if pid, err := strconv.ParseInt(preorder, 10, 32); err == nil { taskInfo.Preorders = append(taskInfo.Preorders, int32(pid)) } else { logrus.Warnf("preorder convert to int32 error:%v", err) diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index aa0189bf6..c92485b33 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -1,6 +1,7 @@ package schedules import ( + "slices" "sort" "strconv" "time" @@ -23,13 +24,13 @@ type SchedulerAlgorithm interface { type SchedulerAlgorithmManager struct { filteredSchedulerAlgorithms map[string]SchedulerAlgorithm - schuduledSchedulerAlgorithms map[string]SchedulerAlgorithm + scheduledSchedulerAlgorithms map[string]SchedulerAlgorithm dispatchPaused bool } func (am *SchedulerAlgorithmManager) Init() { am.filteredSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) - am.schuduledSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) + am.scheduledSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) am.dispatchPaused = false // Register filter and schedule algorithms am.RegisterFilterAlgorithm(&DependsSchedulerAlgorithm{}) @@ -38,33 +39,33 @@ func (am *SchedulerAlgorithmManager) Init() { am.RegisterSchedulerAlgorithm(&PriorityElderSchedulerAlgorithm{}) } -func (am *SchedulerAlgorithmManager) RegisterSchedulerAlgorithm(SchedulerAlgorithm SchedulerAlgorithm) { - if SchedulerAlgorithm == nil { +func (am *SchedulerAlgorithmManager) RegisterSchedulerAlgorithm(schedulerAlgorithm SchedulerAlgorithm) { + if schedulerAlgorithm == nil { return } - name := SchedulerAlgorithm.Name() - if _, exists := am.schuduledSchedulerAlgorithms[name]; exists { + name := schedulerAlgorithm.Name() + if _, exists := am.scheduledSchedulerAlgorithms[name]; exists { return // SchedulerAlgorithm already registered } // only support one scheduling algorithm for now - if len(am.schuduledSchedulerAlgorithms) > 0 { + if len(am.scheduledSchedulerAlgorithms) > 0 { return // Only one scheduling algorithm can be registered } - SchedulerAlgorithm.Init() - am.schuduledSchedulerAlgorithms[name] = SchedulerAlgorithm + schedulerAlgorithm.Init() + am.scheduledSchedulerAlgorithms[name] = schedulerAlgorithm } -func (am *SchedulerAlgorithmManager) RegisterFilterAlgorithm(FilterAlgorithm SchedulerAlgorithm) { - if FilterAlgorithm == nil { +func (am *SchedulerAlgorithmManager) RegisterFilterAlgorithm(filterAlgorithm SchedulerAlgorithm) { + if filterAlgorithm == nil { return } - name := FilterAlgorithm.Name() + name := filterAlgorithm.Name() if _, exists := am.filteredSchedulerAlgorithms[name]; exists { return // SchedulerAlgorithm already registered } - 
FilterAlgorithm.Init() - am.filteredSchedulerAlgorithms[name] = FilterAlgorithm + filterAlgorithm.Init() + am.filteredSchedulerAlgorithms[name] = filterAlgorithm } func (am *SchedulerAlgorithmManager) IsDispatchPaused() bool { @@ -98,7 +99,7 @@ func (am *SchedulerAlgorithmManager) ScheduleNextTasks(allTasks []*structure.Tas // only support one scheduling algorithm for now // get first algorithm - for _, algorithm := range am.schuduledSchedulerAlgorithms { + for _, algorithm := range am.scheduledSchedulerAlgorithms { tasks, err := algorithm.ScheduleNextTasks(filteredTasks, taskToWorkerGroupMap, idleWorkerGroups, concurrentWorkerGroups, softSchedule) if err != nil { return nil, err @@ -268,7 +269,12 @@ func (p *PriorityElderSchedulerAlgorithm) CalculateTaskEmergency(task *structure priorityCost := priorityParam * int64(task.Priority) // step 3: resource cost graph := structure.GraphManager.GetGraphByName(task.SpaceName, task.GraphName) - resourceCost := resourceParam / max(1, graph.VertexCount+graph.EdgeCount) // Avoid division by zero, ensure at least 1 + resourceCost := int64(0) + if graph == nil { + resourceCost = resourceParam // if graph not found, use max resource cost + } else { + resourceCost = resourceParam / max(1, graph.VertexCount+graph.EdgeCount) // Avoid division by zero, ensure at least 1 + } // step 4: some random value randomValue := int64(randomValueParam) // Placeholder for any random value logic if printValue { @@ -282,13 +288,19 @@ func (p *PriorityElderSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structur return nil, nil // No tasks to schedule } + // calculate emergency value for each task + taskEmergencies := make(map[int32]int64) + for _, task := range allTasks { + taskEmergencies[task.ID] = p.CalculateTaskEmergency(task, taskToWorkerGroupMap, false) + } + // Sort tasks by priority (higher priority first) sort.Slice(allTasks, func(i, j int) bool { - return p.CalculateTaskEmergency(allTasks[i], taskToWorkerGroupMap, false) > p.CalculateTaskEmergency(allTasks[j], taskToWorkerGroupMap, false) + return taskEmergencies[allTasks[i].ID] > taskEmergencies[allTasks[j].ID] }) for _, task := range allTasks { - logrus.Debugf("Task %d: Emergency Value: %d", task.ID, p.CalculateTaskEmergency(task, taskToWorkerGroupMap, true)) + logrus.Debugf("Task %d: Emergency Value: %d", task.ID, taskEmergencies[task.ID]) } for _, task := range allTasks { @@ -427,7 +439,7 @@ func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.Task // Check if all dependencies are satisfied allDepsSatisfied := true for _, dep := range depends { - if depTask, exists := allTaskIDs[dep]; !exists || depTask.State != structure.TaskStateWaiting { + if depTask, exists := allTaskIDs[dep]; exists && depTask.State != structure.TaskStateComplete { allDepsSatisfied = false break } @@ -435,11 +447,9 @@ func (d *DependsSchedulerAlgorithm) ScheduleNextTasks(allTasks []*structure.Task if allDepsSatisfied { if group, exists := taskToWorkerGroupMap[task.ID]; exists && group != "" { // only support idle worker groups for now - for _, idleGroup := range idleWorkerGroups { - if group == idleGroup { - logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) - return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled - } + if slices.Contains(idleWorkerGroups, group) { + logrus.Debugf("Task %d is assigned to worker group %s", task.ID, group) + return []*structure.TaskInfo{task}, nil // Return the first task that can be scheduled } } } diff --git 
a/vermeer/apps/master/schedules/scheduler_cron_manager.go b/vermeer/apps/master/schedules/scheduler_cron_manager.go index f33ccb10e..49f331954 100644 --- a/vermeer/apps/master/schedules/scheduler_cron_manager.go +++ b/vermeer/apps/master/schedules/scheduler_cron_manager.go @@ -43,7 +43,6 @@ func (t *SchedulerCronManager) AddCronTask(taskInfo *structure.TaskInfo) error { } // add to cron tasks - t.cronTasks[taskInfo.ID] = append(t.cronTasks[taskInfo.ID], taskInfo) cronJob := cron.New() _, err := cronJob.AddFunc(taskInfo.CronExpr, func() { if taskInfo == nil { @@ -63,6 +62,7 @@ func (t *SchedulerCronManager) AddCronTask(taskInfo *structure.TaskInfo) error { logrus.Errorf("Failed to add cron job for task %d: %v", taskInfo.ID, err) return err } + t.cronTasks[taskInfo.ID] = append(t.cronTasks[taskInfo.ID], taskInfo) t.crons[taskInfo.ID] = append(t.crons[taskInfo.ID], cronJob) cronJob.Start() logrus.Infof("Added cron task for task ID %d with expression %s", taskInfo.ID, taskInfo.CronExpr) diff --git a/vermeer/apps/master/schedules/scheduler_task_manager.go b/vermeer/apps/master/schedules/scheduler_task_manager.go index 2f850f23b..f3e7f4098 100644 --- a/vermeer/apps/master/schedules/scheduler_task_manager.go +++ b/vermeer/apps/master/schedules/scheduler_task_manager.go @@ -8,6 +8,7 @@ import ( ) type SchedulerTaskManager struct { + structure.MutexLocker // This struct is responsible for managing tasks in the scheduling system. // A map from task ID to TaskInfo can be used to track tasks. allTaskMap map[int32]*structure.TaskInfo @@ -32,6 +33,8 @@ func (t *SchedulerTaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, er return false, errors.New("the property `SpaceName` of taskInfo is empty") } + defer t.Unlock(t.Lock()) + // Add the task to the task map t.allTaskMap[taskInfo.ID] = taskInfo t.allTaskQueue = append(t.allTaskQueue, taskInfo) @@ -53,6 +56,13 @@ func (t *SchedulerTaskManager) RemoveTask(taskID int32) error { return errors.New("task not found") } delete(t.allTaskMap, taskID) + // remove from queue + for i, task := range t.allTaskQueue { + if task.ID == taskID { + t.allTaskQueue = append(t.allTaskQueue[:i], t.allTaskQueue[i+1:]...) + break + } + } delete(t.taskToworkerGroupMap, taskID) return nil } diff --git a/vermeer/config/master.ini b/vermeer/config/master.ini index 72695d6ed..34f1859c9 100644 --- a/vermeer/config/master.ini +++ b/vermeer/config/master.ini @@ -14,7 +14,7 @@ ; limitations under the License. 
[default] -log_level=debug +log_level=info debug_mode=release http_peer=0.0.0.0:6688 grpc_peer=0.0.0.0:6689 diff --git a/vermeer/test/functional/compute_base.go b/vermeer/test/functional/compute_base.go index e745aa65e..67bad8f12 100644 --- a/vermeer/test/functional/compute_base.go +++ b/vermeer/test/functional/compute_base.go @@ -140,7 +140,7 @@ func (ctb *ComputeTaskBase) SendComputeReqAsyncBatchPriority(params []map[string //若成功启动Compute Task,开始轮询tasksGet,解析response,得到状态为完成时break。 var taskResp *client.TaskResponse var err error - for i := 0; i < ctb.waitSecond; i++ { + for attempt := 0; attempt < ctb.waitSecond; attempt++ { ctb.healthCheck.DoHealthCheck() taskResp, err = ctb.masterHttp.GetTask(ctb.taskID) require.NoError(ctb.t, err) diff --git a/vermeer/test/functional/load_local.go b/vermeer/test/functional/load_local.go index bf217cf07..52575bb1f 100644 --- a/vermeer/test/functional/load_local.go +++ b/vermeer/test/functional/load_local.go @@ -47,6 +47,8 @@ func (lt *LoadTaskLocal) TaskLoadBody() map[string]string { } } +// TaskLoadBodyWithNum creates load configuration with specified number of files. +// If num <= 10, it will be automatically adjusted to 30 to ensure minimum test coverage. func (lt *LoadTaskLocal) TaskLoadBodyWithNum(num int) map[string]string { vertexBackends := []string{"db", "mem"} diff --git a/vermeer/vermeer_test.go b/vermeer/vermeer_test.go index 0be370c07..fcc932721 100644 --- a/vermeer/vermeer_test.go +++ b/vermeer/vermeer_test.go @@ -111,9 +111,9 @@ func testScheduler(t *testing.T) { func testAlgorithms(t *testing.T) { // todo: 增加算法名称 - // var computeTasks = []string{"pagerank", "lpa", "wcc", "degree_out", "degree_in", "degree_both", "triangle_count", - // "sssp", "closeness_centrality", "betweenness_centrality", "kcore", "jaccard", "ppr", "clustering_coefficient", "scc", "louvain"} - var computeTasks = []string{"pagerank"} + var computeTasks = []string{"pagerank", "lpa", "wcc", "degree_out", "degree_in", "degree_both", "triangle_count", + "sssp", "closeness_centrality", "betweenness_centrality", "kcore", "jaccard", "ppr", "clustering_coefficient", "scc", "louvain"} + // var computeTasks = []string{"pagerank"} startTime := time.Now() expectRes, err := functional.GetExpectRes(expectResPath) From 767cad829eaecbbda368b3c1149c9cabe76a42fd Mon Sep 17 00:00:00 2001 From: ethereal Date: Mon, 22 Sep 2025 22:22:48 +0800 Subject: [PATCH 22/27] chore: change getting all task to complete/ refresh group assign --- vermeer/apps/master/bl/compute_task.go | 2 + vermeer/apps/master/bl/load_task.go | 3 ++ vermeer/apps/master/bl/scheduler_bl.go | 19 +++++++- .../schedules/scheduler_task_manager.go | 46 +++++++++++++++++-- 4 files changed, 64 insertions(+), 6 deletions(-) diff --git a/vermeer/apps/master/bl/compute_task.go b/vermeer/apps/master/bl/compute_task.go index 80ca80528..e8abef046 100644 --- a/vermeer/apps/master/bl/compute_task.go +++ b/vermeer/apps/master/bl/compute_task.go @@ -142,6 +142,8 @@ func (ctb *ComputeTaskBl) ComputeTaskStatus( } } taskMgr.ForceState(computeTask.Task, structure.TaskStateComplete) + // for scheduler, mark task complete + Scheduler.taskManager.MarkTaskComplete(taskId) graph.SubUsingNum() computeTask.FreeMemory() needQuery := options.GetInt(computeTask.Task.Params, "output.need_query") == 1 diff --git a/vermeer/apps/master/bl/load_task.go b/vermeer/apps/master/bl/load_task.go index 4e1f79f4f..80c0023bc 100644 --- a/vermeer/apps/master/bl/load_task.go +++ b/vermeer/apps/master/bl/load_task.go @@ -204,6 +204,9 @@ func (lb *LoadTaskBl) 
LoadTaskStatus(taskId int32, state string, workerName stri loadTask.Task.SetState(structure.TaskStateLoaded) //TaskMgr.ForceState(loadTask.Task, structure.TaskStateLoaded) + // for scheduler, mark task complete + Scheduler.taskManager.MarkTaskComplete(taskId) + logrus.Infof("graph: %s, vertex: %d, edge: %d", graph.Name, graph.VertexCount, graph.EdgeCount) for _, w := range graph.Workers { logrus.Infof( diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index 5582acf62..137a6401e 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -134,12 +134,13 @@ func (s *ScheduleBl) tryScheduleInner(softSchedule bool, noLock ...bool) error { // step 1: make sure all tasks have alloc to a worker group // This is done by the TaskManager, which assigns a worker group to each task + s.taskManager.RefreshTaskToWorkerGroupMap() // step 2: get available resources and tasks logrus.Debugf("scheduling next tasks, softSchedule: %v", softSchedule) idleWorkerGroups := s.resourceManager.GetIdleWorkerGroups() concurrentWorkerGroups := s.resourceManager.GetConcurrentWorkerGroups() - allTasks := s.taskManager.GetAllTasks() + allTasks := s.taskManager.GetAllTasksNotComplete() if len(allTasks) == 0 || (len(idleWorkerGroups) == 0 && len(concurrentWorkerGroups) == 0) { logrus.Debugf("no available tasks or workerGroups, allTasks: %d, workerGroups: %d/%d", len(allTasks), len(idleWorkerGroups), len(concurrentWorkerGroups)) @@ -147,6 +148,9 @@ func (s *ScheduleBl) tryScheduleInner(softSchedule bool, noLock ...bool) error { } logrus.Debugf("all tasks: %d, workerGroups: %d/%d", len(allTasks), len(idleWorkerGroups), len(concurrentWorkerGroups)) + // TODO: NEED TO JUDGE IF THE TASK CAN CONCURRENTLY RUNNING + // NOT only by user setting, but also by scheduler setting + // step 3: return the task with the highest priority or small tasks which can be executed immediately taskToWorkerGroupMap := s.taskManager.GetTaskToWorkerGroupMap() nextTasks, err := s.algorithmManager.ScheduleNextTasks(allTasks, taskToWorkerGroupMap, idleWorkerGroups, concurrentWorkerGroups, softSchedule) @@ -195,6 +199,19 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { logrus.Debugf("queuing task %d with parameters: %+v", taskInfo.ID, taskInfo) + // check dependency if exists + if len(taskInfo.Preorders) > 0 { + for _, depTaskID := range taskInfo.Preorders { + depTask := taskMgr.GetTaskByID(depTaskID) + if depTask == nil { + err := errors.New("the dependency task with ID " + strconv.Itoa(int(depTaskID)) + " does not exist") + logrus.Error(err) + taskMgr.SetError(taskInfo, err.Error()) + return false, err + } + } + } + // Notice: Ensure successful invocation. // make sure all tasks have alloc to a worker group ok, err := s.taskManager.QueueTask(taskInfo) diff --git a/vermeer/apps/master/schedules/scheduler_task_manager.go b/vermeer/apps/master/schedules/scheduler_task_manager.go index f3e7f4098..7aca94581 100644 --- a/vermeer/apps/master/schedules/scheduler_task_manager.go +++ b/vermeer/apps/master/schedules/scheduler_task_manager.go @@ -14,12 +14,15 @@ type SchedulerTaskManager struct { allTaskMap map[int32]*structure.TaskInfo allTaskQueue []*structure.TaskInfo startTaskQueue []*structure.TaskInfo + // onGoingTasks + notCompleteTasks map[int32]*structure.TaskInfo // A map from task ID to worker group can be used to track which worker group is handling which task. 
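Beside `allTaskMap`, the scheduler now keeps a `notCompleteTasks` map so each scheduling pass only scans tasks that still need work; the compute and load status handlers call `MarkTaskComplete` to drop finished tasks from that view without losing their history. A minimal sketch of that bookkeeping, using simplified stand-in types rather than the real `structure.TaskInfo`:

```
package schedules

import "sync"

// Task is a trimmed-down stand-in for structure.TaskInfo.
type Task struct {
	ID    int32
	Space string
	Graph string
}

// registry tracks every queued task plus the subset that is not finished yet.
type registry struct {
	mu          sync.Mutex
	all         map[int32]*Task
	notComplete map[int32]*Task
	taskToGroup map[int32]string
}

func newRegistry() *registry {
	return &registry{
		all:         make(map[int32]*Task),
		notComplete: make(map[int32]*Task),
		taskToGroup: make(map[int32]string),
	}
}

// Queue records a task in both maps; the group assignment may be refreshed later.
func (r *registry) Queue(t *Task, group string) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.all[t.ID] = t
	r.notComplete[t.ID] = t
	r.taskToGroup[t.ID] = group
}

// MarkComplete keeps the task in `all` for history but removes it from the
// scheduling view, so the next pass no longer considers it.
func (r *registry) MarkComplete(id int32) {
	r.mu.Lock()
	defer r.mu.Unlock()
	delete(r.notComplete, id)
}

// Pending returns the tasks a scheduling pass should still look at.
func (r *registry) Pending() []*Task {
	r.mu.Lock()
	defer r.mu.Unlock()
	out := make([]*Task, 0, len(r.notComplete))
	for _, t := range r.notComplete {
		out = append(out, t)
	}
	return out
}
```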
taskToworkerGroupMap map[int32]string } func (t *SchedulerTaskManager) Init() *SchedulerTaskManager { t.allTaskMap = make(map[int32]*structure.TaskInfo) + t.notCompleteTasks = make(map[int32]*structure.TaskInfo) t.taskToworkerGroupMap = make(map[int32]string) return t } @@ -38,10 +41,23 @@ func (t *SchedulerTaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, er // Add the task to the task map t.allTaskMap[taskInfo.ID] = taskInfo t.allTaskQueue = append(t.allTaskQueue, taskInfo) + t.notCompleteTasks[taskInfo.ID] = taskInfo t.AssignGroup(taskInfo) return true, nil } +func (t *SchedulerTaskManager) RefreshTaskToWorkerGroupMap() { + defer t.Unlock(t.Lock()) + + for _, taskInfo := range t.GetAllTasksNotComplete() { + if taskInfo == nil { + continue + } + t.AssignGroup(taskInfo) + t.taskToworkerGroupMap[taskInfo.ID] = workerMgr.ApplyGroup(taskInfo.SpaceName, taskInfo.GraphName) + } +} + // Only for debug or test, get task start sequence func (t *SchedulerTaskManager) AddTaskStartSequence(taskID int32) error { if _, exists := t.allTaskMap[taskID]; !exists { @@ -64,6 +80,15 @@ func (t *SchedulerTaskManager) RemoveTask(taskID int32) error { } } delete(t.taskToworkerGroupMap, taskID) + delete(t.notCompleteTasks, taskID) + return nil +} + +func (t *SchedulerTaskManager) MarkTaskComplete(taskID int32) error { + if _, exists := t.allTaskMap[taskID]; !exists { + return errors.New("task not found") + } + delete(t.notCompleteTasks, taskID) return nil } @@ -106,9 +131,17 @@ func (t *SchedulerTaskManager) GetAllTasks() []*structure.TaskInfo { return tasks } +func (t *SchedulerTaskManager) GetAllTasksNotComplete() []*structure.TaskInfo { + tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) + for _, task := range t.notCompleteTasks { + tasks = append(tasks, task) + } + return tasks +} + func (t *SchedulerTaskManager) GetAllTasksWaitng() []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) - for _, task := range t.allTaskMap { + for _, task := range t.GetAllTasksNotComplete() { if task.State == structure.TaskStateWaiting { tasks = append(tasks, task) } @@ -118,7 +151,7 @@ func (t *SchedulerTaskManager) GetAllTasksWaitng() []*structure.TaskInfo { func (t *SchedulerTaskManager) GetTasksInQueue(space string) []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0) - for _, task := range t.allTaskQueue { + for _, task := range t.GetAllTasksNotComplete() { if task.SpaceName == space { tasks = append(tasks, task) } @@ -153,9 +186,12 @@ func (t *SchedulerTaskManager) GetTaskStartSequence(queryTasks []int32) []*struc func (t *SchedulerTaskManager) GetTaskToWorkerGroupMap() map[int32]string { // Return a copy of the worker group map to avoid external modifications - groupMap := make(map[int32]string, len(t.taskToworkerGroupMap)) - for k, v := range t.taskToworkerGroupMap { - groupMap[k] = v + taskNotComplete := t.GetAllTasksNotComplete() + groupMap := make(map[int32]string, len(taskNotComplete)) + for _, task := range taskNotComplete { + if group, exists := t.taskToworkerGroupMap[task.ID]; exists { + groupMap[task.ID] = group + } } return groupMap } From c00bd801973dfd97d3762609118ffb65f7c88c4a Mon Sep 17 00:00:00 2001 From: ethereal Date: Thu, 25 Sep 2025 16:03:45 +0800 Subject: [PATCH 23/27] chore: add some corner test --- vermeer/apps/master/bl/task_bl.go | 16 +++- vermeer/test/functional/compute_base.go | 13 +++ vermeer/test/functional/compute_task.go | 1 + vermeer/test/functional/http_interface.go | 10 +++ vermeer/test/scheduler/priority.go | 100 
++++++++++++++++++++++ 5 files changed, 138 insertions(+), 2 deletions(-) diff --git a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go index 373b0964c..49724fba6 100644 --- a/vermeer/apps/master/bl/task_bl.go +++ b/vermeer/apps/master/bl/task_bl.go @@ -67,23 +67,30 @@ func (tb *TaskBl) CreateTaskInfo( // for scheduler taskInfo.Priority = 0 taskInfo.Preorders = make([]int32, 0) - taskInfo.Exclusive = false // default to false, can be set to true if needed - // taskInfo.Exclusive = true + taskInfo.Exclusive = true // default to true for now, can be set false by params if params != nil { if priority, ok := params["priority"]; ok { if p, err := strconv.ParseInt(priority, 10, 32); err == nil { + if p < 0 { + return nil, fmt.Errorf("priority should be non-negative") + } taskInfo.Priority = int32(p) } else { logrus.Warnf("priority convert to int32 error:%v", err) + return nil, err } } if preorders, ok := params["preorders"]; ok { preorderList := strings.Split(preorders, ",") for _, preorder := range preorderList { if pid, err := strconv.ParseInt(preorder, 10, 32); err == nil { + if taskMgr.GetTaskByID(int32(pid)) == nil { + return nil, fmt.Errorf("preorder task id %d not exists", pid) + } taskInfo.Preorders = append(taskInfo.Preorders, int32(pid)) } else { logrus.Warnf("preorder convert to int32 error:%v", err) + return nil, err } } } @@ -92,9 +99,14 @@ func (tb *TaskBl) CreateTaskInfo( taskInfo.Exclusive = ex } else { logrus.Warnf("exclusive convert to bool error:%v", err) + return nil, err } } if cronExpr, ok := params["cron_expr"]; ok { + if err := Scheduler.cronManager.CheckCronExpression(cronExpr); err != nil { + logrus.Warnf("cron_expr parse error:%v", err) + return nil, err + } taskInfo.CronExpr = cronExpr } } diff --git a/vermeer/test/functional/compute_base.go b/vermeer/test/functional/compute_base.go index 67bad8f12..344e03b72 100644 --- a/vermeer/test/functional/compute_base.go +++ b/vermeer/test/functional/compute_base.go @@ -108,6 +108,19 @@ func (ctb *ComputeTaskBase) SendComputeReqAsyncNotWait(params map[string]string) return int32(resp.Task.ID) } +func (ctb *ComputeTaskBase) SendComputeReqAsyncNotWaitWithError(params map[string]string) (int32, error) { + //create Compute Task + resp, err := ctb.masterHttp.CreateTaskAsync(client.TaskCreateRequest{ + TaskType: "compute", + GraphName: ctb.graphName, + Params: params, + }) + if err != nil { + return -1, err + } + return int32(resp.Task.ID), nil +} + func (ctb *ComputeTaskBase) SendComputeReqAsyncBatchPriority(params []map[string]string) ([]int32, []int32) { //create Compute Task tasks := make([]client.TaskInfo, 0, len(params)) diff --git a/vermeer/test/functional/compute_task.go b/vermeer/test/functional/compute_task.go index 9373fe153..08d65fdcb 100644 --- a/vermeer/test/functional/compute_task.go +++ b/vermeer/test/functional/compute_task.go @@ -37,6 +37,7 @@ type ComputeTask interface { TaskComputeBody() map[string]string SendComputeReqAsync(params map[string]string) SendComputeReqAsyncNotWait(params map[string]string) int32 + SendComputeReqAsyncNotWaitWithError(params map[string]string) (int32, error) SendComputeReqAsyncBatchPriority(params []map[string]string) ([]int32, []int32) SendComputeReqSync(params map[string]string) LoadComputeRes() ([]interface{}, error) diff --git a/vermeer/test/functional/http_interface.go b/vermeer/test/functional/http_interface.go index 2869db146..30b5ecb81 100644 --- a/vermeer/test/functional/http_interface.go +++ b/vermeer/test/functional/http_interface.go @@ -76,6 +76,16 @@ 
func (ct CancelTask) CancelTask(t *testing.T, master *client.VermeerClient, grap require.Equal(t, "canceled", task.Task.Status) } +func (ct CancelTask) DirectCancelTask(t *testing.T, master *client.VermeerClient, taskID int32) { + ok, err := master.GetTaskCancel(int(taskID)) + require.NoError(t, err) + require.Equal(t, true, ok) + + task, err := master.GetTask(int(taskID)) + require.NoError(t, err) + require.Equal(t, "canceled", task.Task.Status) +} + type GetGraphs struct { } diff --git a/vermeer/test/scheduler/priority.go b/vermeer/test/scheduler/priority.go index 439eb6b93..63256139e 100644 --- a/vermeer/test/scheduler/priority.go +++ b/vermeer/test/scheduler/priority.go @@ -2,9 +2,11 @@ package scheduler import ( "fmt" + "sync" "testing" "time" + "vermeer/apps/structure" "vermeer/client" "vermeer/test/functional" @@ -143,6 +145,100 @@ func SubTestDepends(t *testing.T, expectRes *functional.ExpectRes, healthCheck * fmt.Printf("Test Depends: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) } +// SubTestInvalidDependency 测试当任务依赖一个不存在的任务ID时,调度器的行为。 +// 调度器应该拒绝此任务,并返回一个错误。 +func SubTestInvalidDependency(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { + fmt.Printf("Test Invalid Dependency start with task: %s\n", computeTask) + bTime := time.Now() + + computeTest, err := functional.MakeComputeTask(computeTask) + require.NoError(t, err) + computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) + + taskBody := computeTest.TaskComputeBody() + // 设置 preorders 为一个非常大的、理论上不存在的任务ID + invalidTaskID := 999999999 + taskBody["preorders"] = fmt.Sprintf("%d", invalidTaskID) + + logrus.Infof("Attempting to submit a task with invalid dependency on ID: %d", invalidTaskID) + + // 尝试异步提交任务,并检查是否返回了错误 + taskID, err := computeTest.SendComputeReqAsyncNotWaitWithError(taskBody) + + // 断言提交操作失败 + require.Error(t, err, "Submitting a task with a non-existent dependency should return an error.") + // 断言返回的任务ID为0,或者其他表示失败的值 + require.Equal(t, int32(-1), taskID, "The task ID should be zero or invalid on failure.") + + fmt.Printf("Test Invalid Dependency: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) +} + +func SubTestConcurrentCancellation(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { + fmt.Printf("Test Concurrent Cancellation start with task: %s\n", computeTask) + bTime := time.Now() + + computeTest, err := functional.MakeComputeTask(computeTask) + require.NoError(t, err) + computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) + + // 设置任务数量 + const numTasks = 20 + taskBodies := make([]map[string]string, numTasks) + for i := 0; i < numTasks; i++ { + taskBodies[i] = computeTest.TaskComputeBody() + } + + taskIDs := make(chan int32, numTasks) + var wg sync.WaitGroup + + // 1. 
并发提交任务 + for i := 0; i < numTasks; i++ { + wg.Add(1) + go func(body map[string]string) { + defer wg.Done() + taskID := computeTest.SendComputeReqAsyncNotWait(body) + if taskID != 0 { + taskIDs <- taskID + } else { + logrus.Errorf("Failed to submit task: %v", err) + } + }(taskBodies[i]) + } + + wg.Wait() + close(taskIDs) + + submittedTaskIDs := make([]int32, 0, numTasks) + for id := range taskIDs { + submittedTaskIDs = append(submittedTaskIDs, id) + } + + logrus.Infof("Submitted %d tasks concurrently: %+v", len(submittedTaskIDs), submittedTaskIDs) + require.Equal(t, numTasks, len(submittedTaskIDs), "Not all tasks were successfully submitted.") + + cancelTask := functional.CancelTask{} + cancelTask.DirectCancelTask(t, masterHttp, submittedTaskIDs[len(submittedTaskIDs)-1]) + + // 3. 验证任务状态 + // 这里需要一个循环来检查所有任务的最终状态 + // 实际实现中,您可能需要根据调度器的API来轮询任务状态 + // 在这个示例中,我们只做基本的断言,因为没有实际的取消和状态查询逻辑 + logrus.Info("Waiting for tasks to settle...") + time.Sleep(time.Duration(waitSecond) * time.Second) + + checkTask, err := masterHttp.GetTask(int(submittedTaskIDs[numTasks-1])) + + require.NoError(t, err, "Error fetching task status after cancellation.") + require.NotNil(t, checkTask, "Task should exist after cancellation.") + + if structure.TaskState(checkTask.Task.Status) != structure.TaskStateCanceled { + logrus.Warn("No tasks were cancelled; check scheduler behavior.") + require.Fail(t, "Expected at least some tasks to be cancelled.") + } + + fmt.Printf("Test Concurrent Cancellation: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) +} + func TestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { fmt.Print("start test priority\n") @@ -176,4 +272,8 @@ func TestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *fu // 6. send tasks to different graphs // expect: the tasks should be executed concurrently // have been tested in SubTestSmall and SubTestDepends + + SubTestInvalidDependency(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + + SubTestConcurrentCancellation(t, expectRes, healthCheck, masterHttp, graphName, computeTask, 3) } From dad0b391ea5cda0e9ab6c219caabd8a09ab83d78 Mon Sep 17 00:00:00 2001 From: ethereal Date: Fri, 26 Sep 2025 11:12:11 +0800 Subject: [PATCH 24/27] chore: add some comments --- vermeer/apps/master/bl/scheduler_bl.go | 50 +++++++++- .../schedules/scheduler_algorithm_manager.go | 48 ++++++++++ .../schedules/scheduler_cron_manager.go | 35 +++++++ .../schedules/scheduler_resource_manager.go | 46 +++++++++ .../schedules/scheduler_task_manager.go | 65 ++++++++++++- vermeer/test/functional/compute_base.go | 16 ++++ vermeer/test/functional/http_interface.go | 6 ++ vermeer/test/scheduler/batch.go | 10 ++ vermeer/test/scheduler/priority.go | 94 ++++++++++++++++--- vermeer/test/scheduler/routine.go | 10 ++ vermeer/test/scheduler/test_scheduler.go | 10 ++ 11 files changed, 374 insertions(+), 16 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index 137a6401e..b29fd700e 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -28,6 +28,10 @@ import ( "github.com/sirupsen/logrus" ) +/* +* @Description: ScheduleBl is the scheduler business logic. +* @Note: This is the main scheduler business logic. 
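The concurrent-cancellation test above fans submissions out with a `sync.WaitGroup` and collects the returned IDs through a buffered channel before cancelling one of them. A stripped-down sketch of that fan-out and collect pattern; `submit` here is a hypothetical stand-in for `SendComputeReqAsyncNotWait`:

```
package main

import (
	"fmt"
	"sync"
)

// submit stands in for the async task-creation call; it returns a task ID.
func submit(i int) int32 { return int32(i + 1) }

func main() {
	const numTasks = 20
	ids := make(chan int32, numTasks) // buffered so goroutines never block on send
	var wg sync.WaitGroup

	for i := 0; i < numTasks; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			if id := submit(i); id != 0 {
				ids <- id
			}
		}(i)
	}

	wg.Wait() // all submissions done
	close(ids)

	collected := make([]int32, 0, numTasks)
	for id := range ids {
		collected = append(collected, id)
	}
	fmt.Printf("submitted %d tasks\n", len(collected))
}
```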
+ */ type ScheduleBl struct { structure.MutexLocker // resource management @@ -46,6 +50,10 @@ type ScheduleBl struct { softSchedule bool } +/* +* @Description: Init initializes the ScheduleBl. +* @Note: This function will initialize the ScheduleBl. + */ func (s *ScheduleBl) Init() { logrus.Info("Initializing ScheduleBl...") s.LoadConfig() @@ -64,6 +72,10 @@ func (s *ScheduleBl) Init() { go s.waitingStartedTask() } +/* +* @Description: LoadConfig loads the configuration from the common package. +* @Note: This function will load the configuration from the common package. + */ func (s *ScheduleBl) LoadConfig() { // Load configuration from common package @@ -102,6 +114,10 @@ func (s *ScheduleBl) LoadConfig() { s.startChanSize, s.tickerInterval, s.softSchedule) } +/* +* @Description: startTicker starts the ticker. +* @Note: This function will start the ticker. + */ func (s *ScheduleBl) startTicker() { // Create a ticker with the specified interval ticker := time.Tick(time.Duration(s.tickerInterval) * time.Second) @@ -113,6 +129,11 @@ func (s *ScheduleBl) startTicker() { } // this make scheduler manager try to schedule next tasks +/* +* @Description: TryScheduleNextTasks tries to schedule the next tasks. +* @Note: This function will try to schedule the next tasks. +* @Param noLock + */ func (s *ScheduleBl) TryScheduleNextTasks(noLock ...bool) { defer func() { if err := recover(); err != nil { @@ -126,6 +147,12 @@ func (s *ScheduleBl) TryScheduleNextTasks(noLock ...bool) { } // Main routine to schedule tasks +/* +* @Description: tryScheduleInner tries to schedule the next tasks. +* @Note: This function will try to schedule the next tasks. +* @Param softSchedule +* @Param noLock + */ func (s *ScheduleBl) tryScheduleInner(softSchedule bool, noLock ...bool) error { // Implement logic to get the next task in the queue for the given space if !(len(noLock) > 0 && noLock[0]) { @@ -183,6 +210,12 @@ func (s *ScheduleBl) tryScheduleInner(softSchedule bool, noLock ...bool) error { // QueueTask Add the task to the inner queue. // If the task exists, return false. +/* +* @Description: QueueTask queues the task. +* @Note: This function will queue the task. +* @Param taskInfo +* @Return bool, error + */ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { if taskInfo == nil { return false, errors.New("the argument `taskInfo` is nil") @@ -231,6 +264,12 @@ func (s *ScheduleBl) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { return ok, nil } +/* +* @Description: QueueTaskFromTemplate queues the task from the template. +* @Note: This function will queue the task from the template. This function is used by cron tasks. +* @Param template +* @Return int32, error + */ func (s *ScheduleBl) QueueTaskFromTemplate(template *structure.TaskInfo) (int32, error) { if template == nil { return -1, errors.New("the argument `template` is nil") @@ -255,6 +294,12 @@ func (s *ScheduleBl) QueueTaskFromTemplate(template *structure.TaskInfo) (int32, return taskInfo.ID, nil } +/* +* @Description: BatchQueueTask batches the task. +* @Note: This function will batch the task. 
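`TryScheduleNextTasks` takes a variadic `noLock` flag so callers that already hold the scheduler mutex, such as `CloseCurrent` and `ChangeWorkerStatus`, can trigger a re-schedule without a second acquisition (Go's `sync.Mutex` is not reentrant). A minimal sketch of the convention, assuming a locker whose `Lock` returns a token consumed by `Unlock`, which is how `structure.MutexLocker` is used in these patches:

```
package schedules

import "sync"

// MutexLocker mimics the "defer s.Unlock(s.Lock())" idiom: Lock returns a
// placeholder value so both calls fit on one defer line.
type MutexLocker struct{ mu sync.Mutex }

func (m *MutexLocker) Lock() struct{}  { m.mu.Lock(); return struct{}{} }
func (m *MutexLocker) Unlock(struct{}) { m.mu.Unlock() }

type Scheduler struct{ MutexLocker }

// TrySchedule locks by default; a caller that already holds the mutex passes
// noLock=true to skip the second acquisition.
func (s *Scheduler) TrySchedule(noLock ...bool) {
	if !(len(noLock) > 0 && noLock[0]) {
		defer s.Unlock(s.Lock())
	}
	// ... pick and dispatch the next tasks ...
}

// CloseTask holds the lock for the whole operation and re-schedules with
// noLock=true so TrySchedule does not try to lock again.
func (s *Scheduler) CloseTask(taskID int32) {
	defer s.Unlock(s.Lock())
	// ... release resources held by taskID ...
	s.TrySchedule(true)
}
```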
+* @Param taskInfos +* @Return []bool, []error + */ func (s *ScheduleBl) BatchQueueTask(taskInfos []*structure.TaskInfo) ([]bool, []error) { if len(taskInfos) == 0 { return []bool{}, []error{} @@ -281,7 +326,6 @@ func (s *ScheduleBl) BatchQueueTask(taskInfos []*structure.TaskInfo) ([]bool, [] } // ******** CloseCurrent ******** - func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) error { defer s.Unlock(s.Lock()) @@ -307,6 +351,8 @@ func (s *ScheduleBl) CloseCurrent(taskId int32, removeWorkerName ...string) erro return nil } +// This will be called when a worker is offline. +// This will be called when a worker is online. func (s *ScheduleBl) ChangeWorkerStatus(workerName string, status schedules.WorkerOngoingStatus) (bool, error) { defer s.Unlock(s.Lock()) s.resourceManager.ChangeWorkerStatus(workerName, status) @@ -406,7 +452,7 @@ func (s *ScheduleBl) startWaitingTask(agent *schedules.Agent, taskInfo *structur // ********* CANCEL TASK ******** // handle cancel task - +// need to cancel cron task func (s *ScheduleBl) CancelTask(taskInfo *structure.TaskInfo) error { if taskInfo == nil { return errors.New("the argument `taskInfo` is nil") diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go index c92485b33..1d50a4509 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -11,6 +11,10 @@ import ( "github.com/sirupsen/logrus" ) +/* +* @Description: SchedulerAlgorithm is the interface for the scheduler algorithm. +* @Note: This is the interface for the scheduler algorithm. + */ type SchedulerAlgorithm interface { // Name returns the name of the SchedulerAlgorithm Name() string @@ -22,12 +26,21 @@ type SchedulerAlgorithm interface { ScheduleNextTasks(filteredTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) } +/* +* @Description: SchedulerAlgorithmManager is the manager for the scheduler algorithm. +* @Note: This is the manager for the scheduler algorithm. + */ type SchedulerAlgorithmManager struct { filteredSchedulerAlgorithms map[string]SchedulerAlgorithm scheduledSchedulerAlgorithms map[string]SchedulerAlgorithm dispatchPaused bool } +/* +* @Description: Init initializes the SchedulerAlgorithmManager. +* @Note: This function will initialize the SchedulerAlgorithmManager. + */ +// Need to put DependsSchedulerAlgorithm before WaitingSchedulerAlgorithm func (am *SchedulerAlgorithmManager) Init() { am.filteredSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) am.scheduledSchedulerAlgorithms = make(map[string]SchedulerAlgorithm) @@ -39,6 +52,11 @@ func (am *SchedulerAlgorithmManager) Init() { am.RegisterSchedulerAlgorithm(&PriorityElderSchedulerAlgorithm{}) } +/* +* @Description: RegisterSchedulerAlgorithm registers the scheduler algorithm. +* @Note: This function will register the scheduler algorithm. +* @Param schedulerAlgorithm + */ func (am *SchedulerAlgorithmManager) RegisterSchedulerAlgorithm(schedulerAlgorithm SchedulerAlgorithm) { if schedulerAlgorithm == nil { return @@ -56,6 +74,11 @@ func (am *SchedulerAlgorithmManager) RegisterSchedulerAlgorithm(schedulerAlgorit am.scheduledSchedulerAlgorithms[name] = schedulerAlgorithm } +/* +* @Description: RegisterFilterAlgorithm registers the filter algorithm. +* @Note: This function will register the filter algorithm. 
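The algorithm manager separates filter algorithms, which prune tasks that cannot run yet (for example the dependency check), from the single scheduling algorithm that orders whatever survives. A sketch of that two-stage pipeline with simplified types; the real interface also receives worker-group maps and the `softSchedule` flag:

```
package schedules

// Task is a simplified stand-in for structure.TaskInfo.
type Task struct {
	ID       int32
	Priority int32
}

// Filter removes tasks that are not eligible to run in this pass.
type Filter interface {
	Apply(tasks []*Task) []*Task
}

// Ranker orders the surviving tasks; only one ranker is registered at a time.
type Ranker interface {
	Pick(tasks []*Task) []*Task
}

type Manager struct {
	filters []Filter
	ranker  Ranker
}

// Next runs every filter in order, then lets the single ranker choose.
func (m *Manager) Next(tasks []*Task) []*Task {
	for _, f := range m.filters {
		tasks = f.Apply(tasks)
		if len(tasks) == 0 {
			return nil // nothing left to schedule this round
		}
	}
	if m.ranker == nil {
		return tasks
	}
	return m.ranker.Pick(tasks)
}
```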
+* @Param filterAlgorithm + */ func (am *SchedulerAlgorithmManager) RegisterFilterAlgorithm(filterAlgorithm SchedulerAlgorithm) { if filterAlgorithm == nil { return @@ -68,18 +91,43 @@ func (am *SchedulerAlgorithmManager) RegisterFilterAlgorithm(filterAlgorithm Sch am.filteredSchedulerAlgorithms[name] = filterAlgorithm } +/* +* @Description: IsDispatchPaused checks if the dispatch is paused. +* @Note: This function will check if the dispatch is paused. +* @Return bool + */ func (am *SchedulerAlgorithmManager) IsDispatchPaused() bool { return am.dispatchPaused } +/* +* @Description: PauseDispatch pauses the dispatch. +* @Note: This function will pause the dispatch. + */ func (am *SchedulerAlgorithmManager) PauseDispatch() { am.dispatchPaused = true } +/* +* @Description: ResumeDispatch resumes the dispatch. +* @Note: This function will resume the dispatch. + */ func (am *SchedulerAlgorithmManager) ResumeDispatch() { am.dispatchPaused = false } +/* +* @Description: ScheduleNextTasks schedules the next tasks. +* @Note: This function will schedule the next tasks. +* @Param allTasks +* @Param taskToWorkerGroupMap +* @Param idleWorkerGroups +* @Param concurrentWorkerGroups +* @Param softSchedule +* @Return []*structure.TaskInfo, error + */ +// For all tasks, filter and schedule them +// Only one scheduling algorithm is supported for now func (am *SchedulerAlgorithmManager) ScheduleNextTasks(allTasks []*structure.TaskInfo, taskToWorkerGroupMap map[int32]string, idleWorkerGroups []string, concurrentWorkerGroups []string, softSchedule bool) ([]*structure.TaskInfo, error) { if am.dispatchPaused { return nil, nil // No tasks to schedule if dispatch is paused diff --git a/vermeer/apps/master/schedules/scheduler_cron_manager.go b/vermeer/apps/master/schedules/scheduler_cron_manager.go index 49f331954..12ad5dbd7 100644 --- a/vermeer/apps/master/schedules/scheduler_cron_manager.go +++ b/vermeer/apps/master/schedules/scheduler_cron_manager.go @@ -8,6 +8,10 @@ import ( "github.com/sirupsen/logrus" ) +/* +* @Description: SchedulerCronManager is the manager for the scheduler cron. +* @Note: This is the manager for the scheduler cron. + */ type SchedulerCronManager struct { cronTasks map[int32][]*structure.TaskInfo // cron expression to TaskInfo. Origin task ID to copied tasks crons map[int32][]*cron.Cron // cron expression to cron jobs @@ -15,6 +19,12 @@ type SchedulerCronManager struct { queueTemplateHandler func(*structure.TaskInfo) (int32, error) } +/* +* @Description: Init initializes the SchedulerCronManager. +* @Note: This function will initialize the SchedulerCronManager. +* @Param queueTemplateHandler +* @Return *SchedulerCronManager + */ func (t *SchedulerCronManager) Init(queueTemplateHandler func(*structure.TaskInfo) (int32, error)) *SchedulerCronManager { t.cronTasks = make(map[int32][]*structure.TaskInfo) t.crons = make(map[int32][]*cron.Cron) @@ -22,6 +32,12 @@ func (t *SchedulerCronManager) Init(queueTemplateHandler func(*structure.TaskInf return t } +/* +* @Description: CheckCronExpression checks the cron expression. +* @Note: This function will check the cron expression. +* @Param cronExpr +* @Return error + */ func (t *SchedulerCronManager) CheckCronExpression(cronExpr string) error { if cronExpr == "" { return errors.New("cron expression is empty") @@ -33,6 +49,12 @@ func (t *SchedulerCronManager) CheckCronExpression(cronExpr string) error { return nil } +/* +* @Description: AddCronTask adds the cron task. +* @Note: This function will add the cron task. 
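`CheckCronExpression` and `AddCronTask` lean on robfig/cron: the expression is validated before a task carrying `cron_expr` is accepted, and the manager's maps are only updated after `AddFunc` succeeds so a failed registration leaves no dangling entry. A small sketch of that order of operations, assuming the standard `github.com/robfig/cron/v3` API:

```
package main

import (
	"fmt"
	"log"
	"time"

	"github.com/robfig/cron/v3"
)

// checkExpr rejects an empty or unparsable cron expression before anything is scheduled.
func checkExpr(expr string) error {
	if expr == "" {
		return fmt.Errorf("cron expression is empty")
	}
	_, err := cron.ParseStandard(expr) // standard five-field "m h dom mon dow" format
	return err
}

func main() {
	expr := "*/5 * * * *" // every five minutes

	if err := checkExpr(expr); err != nil {
		log.Fatalf("invalid cron expression: %v", err)
	}

	c := cron.New()
	// Record bookkeeping only after AddFunc succeeds, mirroring the patch:
	// a failed registration must not leave a dangling entry in the task maps.
	if _, err := c.AddFunc(expr, func() {
		fmt.Println("queue a fresh task copy from the template")
	}); err != nil {
		log.Fatalf("failed to add cron job: %v", err)
	}
	c.Start()
	defer c.Stop()

	time.Sleep(11 * time.Minute) // keep the sketch alive long enough to fire twice
}
```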
+* @Param taskInfo +* @Return error + */ func (t *SchedulerCronManager) AddCronTask(taskInfo *structure.TaskInfo) error { if taskInfo == nil { return errors.New("the argument `taskInfo` is nil") @@ -69,6 +91,12 @@ func (t *SchedulerCronManager) AddCronTask(taskInfo *structure.TaskInfo) error { return nil } +/* +* @Description: DeleteTask deletes the cron task. +* @Note: This function will delete the cron task. +* @Param taskID +* @Return error + */ func (t *SchedulerCronManager) DeleteTask(taskID int32) error { if _, exists := t.cronTasks[taskID]; !exists { return errors.New("task not found in cron tasks") @@ -83,6 +111,13 @@ func (t *SchedulerCronManager) DeleteTask(taskID int32) error { return nil } +/* +* @Description: DeleteTaskByGraph deletes the cron task by graph. +* @Note: This function will delete the cron task by graph. +* @Param spaceName +* @Param graphName +* @Return error + */ func (t *SchedulerCronManager) DeleteTaskByGraph(spaceName, graphName string) error { if spaceName == "" || graphName == "" { return errors.New("the argument `spaceName` or `graphName` is empty") diff --git a/vermeer/apps/master/schedules/scheduler_resource_manager.go b/vermeer/apps/master/schedules/scheduler_resource_manager.go index 35f03a342..f74b4c591 100644 --- a/vermeer/apps/master/schedules/scheduler_resource_manager.go +++ b/vermeer/apps/master/schedules/scheduler_resource_manager.go @@ -7,6 +7,10 @@ import ( "github.com/sirupsen/logrus" ) +/* +* @Description: WorkerOngoingStatus is the status of the worker ongoing. +* @Note: This is the status of the worker ongoing. + */ type WorkerOngoingStatus string const ( @@ -17,6 +21,10 @@ const ( WorkerOngoingStatusDeleted WorkerOngoingStatus = "deleted" ) +/* +* @Description: SchedulerResourceManager is the manager for the scheduler resource. +* @Note: This is the manager for the scheduler resource. + */ type SchedulerResourceManager struct { structure.MutexLocker workerStatus map[string]WorkerOngoingStatus @@ -27,6 +35,10 @@ type SchedulerResourceManager struct { broker *Broker } +/* +* @Description: Init initializes the SchedulerResourceManager. +* @Note: This function will initialize the SchedulerResourceManager. + */ func (rm *SchedulerResourceManager) Init() { rm.workerStatus = make(map[string]WorkerOngoingStatus) rm.workerGroupStatus = make(map[string]WorkerOngoingStatus) @@ -34,6 +46,11 @@ func (rm *SchedulerResourceManager) Init() { rm.broker = new(Broker).Init() } +/* +* @Description: ReleaseByTaskID releases the resource by task ID. +* @Note: This function will release the resource by task ID. +* @Param taskID + */ func (rm *SchedulerResourceManager) ReleaseByTaskID(taskID int32) { defer rm.Unlock(rm.Lock()) @@ -64,6 +81,13 @@ func (rm *SchedulerResourceManager) ReleaseByTaskID(taskID int32) { } } +/* +* @Description: isTaskRunningOnWorkerGroup checks if the task is running on the worker group. +* @Note: This function will check if the task is running on the worker group. +* @Param workerGroup +* @Param taskID +* @Return bool + */ func (rm *SchedulerResourceManager) isTaskRunningOnWorkerGroup(workerGroup string, taskID int32) bool { if tasks, exists := rm.runningWorkerGroupTasks[workerGroup]; exists { for _, id := range tasks { @@ -75,6 +99,12 @@ func (rm *SchedulerResourceManager) isTaskRunningOnWorkerGroup(workerGroup strin return false } +/* +* @Description: GetAgentAndAssignTask gets the agent and assigns the task. +* @Note: This function will get the agent and assigns the task. 
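`ReleaseByTaskID` walks the group-to-running-tasks map, removes the finished task from whichever worker group was running it, and frees the group once nothing is left on it. A compact sketch of that bookkeeping with the status reduced to a plain string:

```
package schedules

import "sync"

type groupStatus string

const (
	groupIdle    groupStatus = "idle"
	groupRunning groupStatus = "running"
)

type resources struct {
	mu      sync.Mutex
	status  map[string]groupStatus // worker group -> status
	running map[string][]int32     // worker group -> task IDs currently on it
}

// Assign marks a group busy with a task.
func (r *resources) Assign(group string, taskID int32) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.running[group] = append(r.running[group], taskID)
	r.status[group] = groupRunning
}

// ReleaseByTaskID removes the task wherever it runs and frees the group
// when its task list becomes empty.
func (r *resources) ReleaseByTaskID(taskID int32) {
	r.mu.Lock()
	defer r.mu.Unlock()
	for group, ids := range r.running {
		kept := ids[:0]
		for _, id := range ids {
			if id != taskID {
				kept = append(kept, id)
			}
		}
		r.running[group] = kept
		if len(kept) == 0 {
			r.status[group] = groupIdle
		}
	}
}
```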
+* @Param taskInfo +* @Return *Agent, AgentStatus, error + */ func (rm *SchedulerResourceManager) GetAgentAndAssignTask(taskInfo *structure.TaskInfo) (*Agent, AgentStatus, error) { if taskInfo == nil { return nil, AgentStatusError, errors.New("taskInfo is nil") @@ -115,6 +145,11 @@ func (rm *SchedulerResourceManager) GetAgentAndAssignTask(taskInfo *structure.Ta return agent, status, nil } +/* +* @Description: GetIdleWorkerGroups gets the idle worker groups. +* @Note: This function will get the idle worker groups. +* @Return []string + */ func (rm *SchedulerResourceManager) GetIdleWorkerGroups() []string { defer rm.Unlock(rm.Lock()) @@ -127,6 +162,11 @@ func (rm *SchedulerResourceManager) GetIdleWorkerGroups() []string { return idleWorkerGroups } +/* +* @Description: GetConcurrentWorkerGroups gets the concurrent worker groups. +* @Note: This function will get the concurrent worker groups. +* @Return []string + */ func (rm *SchedulerResourceManager) GetConcurrentWorkerGroups() []string { defer rm.Unlock(rm.Lock()) @@ -139,6 +179,12 @@ func (rm *SchedulerResourceManager) GetConcurrentWorkerGroups() []string { return concurrentWorkerGroups } +/* +* @Description: changeWorkerStatus changes the worker status. +* @Note: This function will change the worker status. +* @Param workerName +* @Param status + */ func (rm *SchedulerResourceManager) changeWorkerStatus(workerName string, status WorkerOngoingStatus) { rm.workerStatus[workerName] = status diff --git a/vermeer/apps/master/schedules/scheduler_task_manager.go b/vermeer/apps/master/schedules/scheduler_task_manager.go index 7aca94581..2b331528f 100644 --- a/vermeer/apps/master/schedules/scheduler_task_manager.go +++ b/vermeer/apps/master/schedules/scheduler_task_manager.go @@ -7,12 +7,17 @@ import ( "github.com/sirupsen/logrus" ) +/* +* @Description: SchedulerTaskManager is the manager for the scheduler task. +* @Note: This is the manager for the scheduler task. + */ type SchedulerTaskManager struct { structure.MutexLocker // This struct is responsible for managing tasks in the scheduling system. // A map from task ID to TaskInfo can be used to track tasks. - allTaskMap map[int32]*structure.TaskInfo - allTaskQueue []*structure.TaskInfo + allTaskMap map[int32]*structure.TaskInfo + allTaskQueue []*structure.TaskInfo + // For debug or test, get task start sequence startTaskQueue []*structure.TaskInfo // onGoingTasks notCompleteTasks map[int32]*structure.TaskInfo @@ -20,6 +25,11 @@ type SchedulerTaskManager struct { taskToworkerGroupMap map[int32]string } +/* +* @Description: Init initializes the SchedulerTaskManager. +* @Note: This function will initialize the SchedulerTaskManager. +* @Return *SchedulerTaskManager + */ func (t *SchedulerTaskManager) Init() *SchedulerTaskManager { t.allTaskMap = make(map[int32]*structure.TaskInfo) t.notCompleteTasks = make(map[int32]*structure.TaskInfo) @@ -27,6 +37,12 @@ func (t *SchedulerTaskManager) Init() *SchedulerTaskManager { return t } +/* +* @Description: QueueTask queues the task. +* @Note: This function will queue the task. +* @Param taskInfo +* @Return bool, error + */ func (t *SchedulerTaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, error) { if taskInfo == nil { return false, errors.New("the argument `taskInfo` is nil") @@ -46,6 +62,10 @@ func (t *SchedulerTaskManager) QueueTask(taskInfo *structure.TaskInfo) (bool, er return true, nil } +/* +* @Description: RefreshTaskToWorkerGroupMap refreshes the task to worker group map. +* @Note: This function will refresh the task to worker group map. 
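`GetIdleWorkerGroups` and `GetConcurrentWorkerGroups` expose the group-level view that a scheduling pass consumes, and `changeWorkerStatus` recomputes a group's status from its members whenever a worker reports in. A sketch of that reduction, assuming a group counts as idle only when every member is idle and as concurrent-capable only when every member is in the concurrent-running state:

```
package schedules

type workerStatus string

const (
	workerIdle       workerStatus = "idle"
	workerRunning    workerStatus = "running"
	workerConcurrent workerStatus = "concurrent_running"
)

// groupView reduces per-worker statuses to a per-group answer.
type groupView struct {
	workers map[string]workerStatus // worker name -> status
	groups  map[string][]string     // group name -> member worker names
}

func (g *groupView) idleGroups() []string {
	out := make([]string, 0, len(g.groups))
	for name, members := range g.groups {
		allIdle := true
		for _, w := range members {
			if g.workers[w] != workerIdle {
				allIdle = false
				break
			}
		}
		if allIdle {
			out = append(out, name)
		}
	}
	return out
}

func (g *groupView) concurrentGroups() []string {
	out := make([]string, 0, len(g.groups))
	for name, members := range g.groups {
		allConcurrent := true
		for _, w := range members {
			if g.workers[w] != workerConcurrent {
				allConcurrent = false
				break
			}
		}
		if allConcurrent {
			out = append(out, name)
		}
	}
	return out
}
```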
+ */ func (t *SchedulerTaskManager) RefreshTaskToWorkerGroupMap() { defer t.Unlock(t.Lock()) @@ -59,6 +79,12 @@ func (t *SchedulerTaskManager) RefreshTaskToWorkerGroupMap() { } // Only for debug or test, get task start sequence +/* +* @Description: AddTaskStartSequence adds the task start sequence. +* @Note: This function will add the task start sequence. +* @Param taskID +* @Return error + */ func (t *SchedulerTaskManager) AddTaskStartSequence(taskID int32) error { if _, exists := t.allTaskMap[taskID]; !exists { return errors.New("task not found") @@ -67,6 +93,12 @@ func (t *SchedulerTaskManager) AddTaskStartSequence(taskID int32) error { return nil } +/* +* @Description: RemoveTask removes the task. +* @Note: This function will remove the task. +* @Param taskID +* @Return error + */ func (t *SchedulerTaskManager) RemoveTask(taskID int32) error { if _, exists := t.allTaskMap[taskID]; !exists { return errors.New("task not found") @@ -84,6 +116,12 @@ func (t *SchedulerTaskManager) RemoveTask(taskID int32) error { return nil } +/* +* @Description: MarkTaskComplete marks the task complete. +* @Note: This function will mark the task complete. +* @Param taskID +* @Return error + */ func (t *SchedulerTaskManager) MarkTaskComplete(taskID int32) error { if _, exists := t.allTaskMap[taskID]; !exists { return errors.New("task not found") @@ -93,6 +131,12 @@ func (t *SchedulerTaskManager) MarkTaskComplete(taskID int32) error { } // update or create a task in the task map +/* +* @Description: AssignGroup assigns the group. +* @Note: This function will assign the group. +* @Param taskInfo +* @Return error + */ func (t *SchedulerTaskManager) AssignGroup(taskInfo *structure.TaskInfo) error { group := workerMgr.ApplyGroup(taskInfo.SpaceName, taskInfo.GraphName) if group == "" { @@ -102,6 +146,12 @@ func (t *SchedulerTaskManager) AssignGroup(taskInfo *structure.TaskInfo) error { return nil } +/* +* @Description: GetTaskByID gets the task by ID. +* @Note: This function will get the task by ID. +* @Param taskID +* @Return *structure.TaskInfo, error + */ func (t *SchedulerTaskManager) GetTaskByID(taskID int32) (*structure.TaskInfo, error) { task, exists := t.allTaskMap[taskID] if !exists { @@ -110,6 +160,12 @@ func (t *SchedulerTaskManager) GetTaskByID(taskID int32) (*structure.TaskInfo, e return task, nil } +/* +* @Description: GetLastTask gets the last task. +* @Note: This function will get the last task. +* @Param spaceName +* @Return *structure.TaskInfo + */ func (t *SchedulerTaskManager) GetLastTask(spaceName string) *structure.TaskInfo { // Implement logic to get the last task in the queue for the given space if len(t.allTaskQueue) == 0 { @@ -123,6 +179,11 @@ func (t *SchedulerTaskManager) GetLastTask(spaceName string) *structure.TaskInfo return nil } +/* +* @Description: GetAllTasks gets all tasks. +* @Note: This function will get all tasks. 
+* @Return []*structure.TaskInfo + */ func (t *SchedulerTaskManager) GetAllTasks() []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) for _, task := range t.allTaskMap { diff --git a/vermeer/test/functional/compute_base.go b/vermeer/test/functional/compute_base.go index 344e03b72..745fd7a57 100644 --- a/vermeer/test/functional/compute_base.go +++ b/vermeer/test/functional/compute_base.go @@ -97,6 +97,11 @@ func (ctb *ComputeTaskBase) SendComputeReqAsync(params map[string]string) { require.Equal(ctb.t, "complete", taskResp.Task.Status) } +/* +* @Description: SendComputeReqAsyncNotWait sends a compute request asynchronously and returns the task ID. +* @Param params +* @Return int32 + */ func (ctb *ComputeTaskBase) SendComputeReqAsyncNotWait(params map[string]string) int32 { //create Compute Task resp, err := ctb.masterHttp.CreateTaskAsync(client.TaskCreateRequest{ @@ -108,6 +113,11 @@ func (ctb *ComputeTaskBase) SendComputeReqAsyncNotWait(params map[string]string) return int32(resp.Task.ID) } +/* +* @Description: SendComputeReqAsyncNotWaitWithError sends a compute request asynchronously and returns the task ID and error. +* @Param params +* @Return int32, error + */ func (ctb *ComputeTaskBase) SendComputeReqAsyncNotWaitWithError(params map[string]string) (int32, error) { //create Compute Task resp, err := ctb.masterHttp.CreateTaskAsync(client.TaskCreateRequest{ @@ -121,6 +131,12 @@ func (ctb *ComputeTaskBase) SendComputeReqAsyncNotWaitWithError(params map[strin return int32(resp.Task.ID), nil } +/* +* @Description: SendComputeReqAsyncBatchPriority sends a compute request asynchronously and returns the task ID and sequence. +* @Note: This function will block the main thread until all tasks are completed. +* @Param params +* @Return []int32, []int32 + */ func (ctb *ComputeTaskBase) SendComputeReqAsyncBatchPriority(params []map[string]string) ([]int32, []int32) { //create Compute Task tasks := make([]client.TaskInfo, 0, len(params)) diff --git a/vermeer/test/functional/http_interface.go b/vermeer/test/functional/http_interface.go index 30b5ecb81..199261e86 100644 --- a/vermeer/test/functional/http_interface.go +++ b/vermeer/test/functional/http_interface.go @@ -76,6 +76,12 @@ func (ct CancelTask) CancelTask(t *testing.T, master *client.VermeerClient, grap require.Equal(t, "canceled", task.Task.Status) } +/* +* @Description: DirectCancelTask cancels a task directly. +* @Param t +* @Param master +* @Param taskID + */ func (ct CancelTask) DirectCancelTask(t *testing.T, master *client.VermeerClient, taskID int32) { ok, err := master.GetTaskCancel(int(taskID)) require.NoError(t, err) diff --git a/vermeer/test/scheduler/batch.go b/vermeer/test/scheduler/batch.go index 7afaad7a8..565bfea29 100644 --- a/vermeer/test/scheduler/batch.go +++ b/vermeer/test/scheduler/batch.go @@ -6,6 +6,16 @@ import ( "vermeer/test/functional" ) +/* +* @Description: This is the main test function for batch. +* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param factor +* @Param waitSecond + */ func TestBatch(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { // TEST GROUP: BATCH // 1. 
send batch tasks to single graph diff --git a/vermeer/test/scheduler/priority.go b/vermeer/test/scheduler/priority.go index 63256139e..ef68e3dda 100644 --- a/vermeer/test/scheduler/priority.go +++ b/vermeer/test/scheduler/priority.go @@ -14,6 +14,16 @@ import ( "github.com/stretchr/testify/require" ) +/* +* @Description: SubTestPriority tests the scheduler's behavior when submitting tasks with different priorities. +* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param computeTask +* @Param waitSecond + */ func SubTestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { fmt.Printf("Test Priority start with task: %s\n", computeTask) bTime := time.Now() @@ -47,6 +57,16 @@ func SubTestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck fmt.Printf("Test Priority: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) } +/* +* @Description: SubTestSmall tests the scheduler's behavior when submitting tasks with different sizes. +* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param computeTask +* @Param waitSecond + */ func SubTestSmall(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { fmt.Printf("Test Small start with task: %s\n", computeTask) bTime := time.Now() @@ -78,6 +98,16 @@ func SubTestSmall(t *testing.T, expectRes *functional.ExpectRes, healthCheck *fu fmt.Printf("Test Small: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) } +/* +* @Description: SubTestConcurrent tests the scheduler's behavior when submitting tasks with different sizes. +* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param computeTask +* @Param waitSecond + */ func SubTestConcurrent(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { fmt.Printf("Test Concurrent start with task: %s\n", computeTask) bTime := time.Now() @@ -103,6 +133,16 @@ func SubTestConcurrent(t *testing.T, expectRes *functional.ExpectRes, healthChec // cost should be less than 2 * single task time } +/* +* @Description: SubTestDepends tests the scheduler's behavior when submitting tasks with different dependencies. +* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param computeTask +* @Param waitSecond + */ func SubTestDepends(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { fmt.Printf("Test Depends start with task: %s\n", computeTask) bTime := time.Now() @@ -145,8 +185,16 @@ func SubTestDepends(t *testing.T, expectRes *functional.ExpectRes, healthCheck * fmt.Printf("Test Depends: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) } -// SubTestInvalidDependency 测试当任务依赖一个不存在的任务ID时,调度器的行为。 -// 调度器应该拒绝此任务,并返回一个错误。 +/* +* @Description: SubTestInvalidDependency tests the scheduler's behavior when a compute task is submitted with a dependency on a non-existent (invalid) task ID. 
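SubTestInvalidDependency expects the master to reject a task whose `preorders` value names an ID that was never created, now that `CreateTaskInfo` validates dependencies up front. A trimmed-down version of that assertion style with testify; `submitWithPreorder` is a hypothetical stand-in for the HTTP call made through `SendComputeReqAsyncNotWaitWithError`:

```
package scheduler_test

import (
	"fmt"
	"testing"

	"github.com/stretchr/testify/require"
)

// submitWithPreorder stands in for the async task-creation call; it fails
// when the declared dependency does not exist.
func submitWithPreorder(existing map[int32]bool, dep int32) (int32, error) {
	if !existing[dep] {
		return -1, fmt.Errorf("preorder task id %d not exists", dep)
	}
	return 42, nil
}

func TestInvalidDependencyRejected(t *testing.T) {
	existing := map[int32]bool{1: true, 2: true}

	// A dependency on an ID that was never queued must be rejected outright.
	id, err := submitWithPreorder(existing, 999999999)
	require.Error(t, err, "submitting a task with a non-existent dependency should fail")
	require.Equal(t, int32(-1), id)

	// A valid dependency goes through.
	id, err = submitWithPreorder(existing, 2)
	require.NoError(t, err)
	require.NotEqual(t, int32(-1), id)
}
```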
+* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param computeTask +* @Param waitSecond + */ func SubTestInvalidDependency(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { fmt.Printf("Test Invalid Dependency start with task: %s\n", computeTask) bTime := time.Now() @@ -156,23 +204,33 @@ func SubTestInvalidDependency(t *testing.T, expectRes *functional.ExpectRes, hea computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) taskBody := computeTest.TaskComputeBody() - // 设置 preorders 为一个非常大的、理论上不存在的任务ID + // set preorders to a very large, theoretically nonexistent task ID invalidTaskID := 999999999 taskBody["preorders"] = fmt.Sprintf("%d", invalidTaskID) logrus.Infof("Attempting to submit a task with invalid dependency on ID: %d", invalidTaskID) - // 尝试异步提交任务,并检查是否返回了错误 + // try to submit task asynchronously and check if it returns an error taskID, err := computeTest.SendComputeReqAsyncNotWaitWithError(taskBody) - // 断言提交操作失败 + // assert that the submission operation failed require.Error(t, err, "Submitting a task with a non-existent dependency should return an error.") - // 断言返回的任务ID为0,或者其他表示失败的值 + // assert that the returned task ID is 0 or other failed values require.Equal(t, int32(-1), taskID, "The task ID should be zero or invalid on failure.") fmt.Printf("Test Invalid Dependency: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) } +/* +* @Description: SubTestConcurrentCancellation tests the scheduler's behavior when submitting tasks concurrently and canceling them. +* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param computeTask +* @Param waitSecond + */ func SubTestConcurrentCancellation(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { fmt.Printf("Test Concurrent Cancellation start with task: %s\n", computeTask) bTime := time.Now() @@ -181,7 +239,7 @@ func SubTestConcurrentCancellation(t *testing.T, expectRes *functional.ExpectRes require.NoError(t, err) computeTest.Init(graphName[0], computeTask, expectRes, waitSecond, masterHttp, t, healthCheck) - // 设置任务数量 + // set task number const numTasks = 20 taskBodies := make([]map[string]string, numTasks) for i := 0; i < numTasks; i++ { @@ -191,7 +249,7 @@ func SubTestConcurrentCancellation(t *testing.T, expectRes *functional.ExpectRes taskIDs := make(chan int32, numTasks) var wg sync.WaitGroup - // 1. 并发提交任务 + // 1. submit tasks concurrently for i := 0; i < numTasks; i++ { wg.Add(1) go func(body map[string]string) { @@ -219,10 +277,8 @@ func SubTestConcurrentCancellation(t *testing.T, expectRes *functional.ExpectRes cancelTask := functional.CancelTask{} cancelTask.DirectCancelTask(t, masterHttp, submittedTaskIDs[len(submittedTaskIDs)-1]) - // 3. 验证任务状态 - // 这里需要一个循环来检查所有任务的最终状态 - // 实际实现中,您可能需要根据调度器的API来轮询任务状态 - // 在这个示例中,我们只做基本的断言,因为没有实际的取消和状态查询逻辑 + // 3. 
verify task status + // wait for tasks to settle logrus.Info("Waiting for tasks to settle...") time.Sleep(time.Duration(waitSecond) * time.Second) @@ -239,6 +295,16 @@ func SubTestConcurrentCancellation(t *testing.T, expectRes *functional.ExpectRes fmt.Printf("Test Concurrent Cancellation: %-30s [OK], cost: %v\n", computeTask, time.Since(bTime)) } +/* +* @Description: This is the main test function for priority. +* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param computeTask +* @Param waitSecond + */ func TestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, factor string, waitSecond int) { fmt.Print("start test priority\n") @@ -273,7 +339,11 @@ func TestPriority(t *testing.T, expectRes *functional.ExpectRes, healthCheck *fu // expect: the tasks should be executed concurrently // have been tested in SubTestSmall and SubTestDepends + // 7. send tasks with invalid dependency to single graph + // expect: the tasks should not be executed SubTestInvalidDependency(t, expectRes, healthCheck, masterHttp, graphName, computeTask, waitSecond) + // 8. send tasks concurrently and cancel them + // expect: the tasks should be cancelled SubTestConcurrentCancellation(t, expectRes, healthCheck, masterHttp, graphName, computeTask, 3) } diff --git a/vermeer/test/scheduler/routine.go b/vermeer/test/scheduler/routine.go index 5cb2c3b5c..55722bcee 100644 --- a/vermeer/test/scheduler/routine.go +++ b/vermeer/test/scheduler/routine.go @@ -11,6 +11,16 @@ import ( "github.com/stretchr/testify/require" ) +/* +* @Description: SubTestRoutine tests the scheduler's behavior when submitting tasks with cron expression. +* @Param t +* @Param expectRes +* @Param healthCheck +* @Param masterHttp +* @Param graphName +* @Param computeTask +* @Param waitSecond + */ func SubTestRoutine(t *testing.T, expectRes *functional.ExpectRes, healthCheck *functional.HealthCheck, masterHttp *client.VermeerClient, graphName []string, computeTask string, waitSecond int) { fmt.Printf("Test Routine start with task: %s\n", computeTask) bTime := time.Now() diff --git a/vermeer/test/scheduler/test_scheduler.go b/vermeer/test/scheduler/test_scheduler.go index 84c2abf37..ea05b258f 100644 --- a/vermeer/test/scheduler/test_scheduler.go +++ b/vermeer/test/scheduler/test_scheduler.go @@ -12,6 +12,16 @@ import ( "vermeer/test/functional" ) +/* +* @Description: This is the main test function for scheduler. 
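Several of these tests settle by sleeping a fixed `waitSecond`; a more robust variant would poll the task status until it reaches a terminal state or a deadline passes. A small sketch of such a poll loop; `getStatus` is a hypothetical stand-in for `masterHttp.GetTask`:

```
package main

import (
	"fmt"
	"time"
)

// getStatus stands in for masterHttp.GetTask; it would return states such as
// "waiting", "running", "complete" or "canceled".
func getStatus(taskID int32) (string, error) { return "complete", nil }

// waitForState polls until the task reaches the wanted state or the timeout expires.
func waitForState(taskID int32, wanted string, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for time.Now().Before(deadline) {
		state, err := getStatus(taskID)
		if err != nil {
			return err
		}
		if state == wanted {
			return nil
		}
		time.Sleep(500 * time.Millisecond)
	}
	return fmt.Errorf("task %d did not reach state %q within %v", taskID, wanted, timeout)
}

func main() {
	if err := waitForState(7, "complete", 30*time.Second); err != nil {
		fmt.Println("poll failed:", err)
		return
	}
	fmt.Println("task finished")
}
```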
+* @Param t
+* @Param expectResPath
+* @Param masterHttpAddr
+* @Param graphName
+* @Param factor
+* @Param waitSecond
+* @Note: You must start at least two workers, named worker01 and worker04, in your config.yaml
+ */
 func TestScheduler(t *testing.T, expectResPath string, masterHttpAddr string, graphName string, factor string, waitSecond int) {
 	fmt.Print("start test scheduler\n")
 

From 5995cf8991c34142915ef26dbab1c397f909ced8 Mon Sep 17 00:00:00 2001
From: ethereal
Date: Fri, 26 Sep 2025 14:18:45 +0800
Subject: [PATCH 25/27] chore: repair ai

---
 vermeer/apps/master/bl/scheduler_bl.go              |  2 +-
 vermeer/apps/master/schedules/broker.go             | 13 ++++++-------
 .../master/schedules/scheduler_resource_manager.go  |  5 +++++
 .../apps/master/schedules/scheduler_task_manager.go |  2 ++
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go
index b29fd700e..4fd5df97c 100644
--- a/vermeer/apps/master/bl/scheduler_bl.go
+++ b/vermeer/apps/master/bl/scheduler_bl.go
@@ -317,8 +317,8 @@ func (s *ScheduleBl) BatchQueueTask(taskInfos []*structure.TaskInfo) ([]bool, []
 		ok, err := s.QueueTask(taskInfo)
 		if err != nil {
 			logrus.Errorf("failed to queue task '%d': %v", taskInfo.ID, err)
-			errors = append(errors, err)
 		}
+		errors = append(errors, err)
 		oks = append(oks, ok)
 	}
 
diff --git a/vermeer/apps/master/schedules/broker.go b/vermeer/apps/master/schedules/broker.go
index 7cecac75e..435219dc9 100644
--- a/vermeer/apps/master/schedules/broker.go
+++ b/vermeer/apps/master/schedules/broker.go
@@ -24,7 +24,6 @@ import (
 	"github.com/sirupsen/logrus"
 
 	"vermeer/apps/master/workers"
-	. "vermeer/apps/master/workers"
 )
 
 type AgentStatus string
@@ -127,7 +126,7 @@ func (b *Broker) ApplyAgent(taskInfo *structure.TaskInfo, forceApply ...bool) (*
 // 	}
 // }
 
-func (b *Broker) isWorkersReady(workers map[string]*WorkerClient) bool {
+func (b *Broker) isWorkersReady(workers map[string]*workers.WorkerClient) bool {
 	ok := false
 	for _, w := range workers {
 		if w.Connection == nil {
@@ -169,7 +168,7 @@ func (b *Broker) isAgentBusy(agent *Agent) bool {
 	return busy
 }
 
-func (b *Broker) isWorkerBusy(workers map[string]*WorkerClient, agent *Agent) bool {
+func (b *Broker) isWorkerBusy(workers map[string]*workers.WorkerClient, agent *Agent) bool {
 	for _, a := range b.agents {
 		if a == agent {
 			continue
@@ -191,7 +190,7 @@ func (b *Broker) isWorkerBusy(workers map[string]*WorkerClient, agent *Agent) bo
 	return false
 }
 
-func (b *Broker) getAgent(taskInfo *structure.TaskInfo) (*Agent, map[string]*WorkerClient, error) {
+func (b *Broker) getAgent(taskInfo *structure.TaskInfo) (*Agent, map[string]*workers.WorkerClient, error) {
 	switch taskInfo.Type {
 	case structure.TaskTypeLoad:
 		fallthrough
@@ -205,7 +204,7 @@ func (b *Broker) getAgent(taskInfo *structure.TaskInfo) (*Agent, map[string]*Wor
 
 }
 
-func (b *Broker) getAgentFromGraph(taskInfo *structure.TaskInfo) (*Agent, map[string]*WorkerClient, error) {
+func (b *Broker) getAgentFromGraph(taskInfo *structure.TaskInfo) (*Agent, map[string]*workers.WorkerClient, error) {
 	graph := graphMgr.GetGraphByName(taskInfo.SpaceName, taskInfo.GraphName)
 	if graph == nil {
 		return nil, nil, fmt.Errorf("failed to retrieve graph with name: %s/%s", taskInfo.SpaceName, taskInfo.GraphName)
@@ -226,7 +225,7 @@ func (b *Broker) getAgentFromGraph(taskInfo *structure.TaskInfo) (*Agent, map[st
 		return nil, nil, nil // waiting for the next check
 	}
 
-	workers := make(map[string]*WorkerClient)
+	workers := make(map[string]*workers.WorkerClient)
 	for _, w = 
range graph.Workers { wc := workerMgr.GetWorker(w.Name) @@ -241,7 +240,7 @@ func (b *Broker) getAgentFromGraph(taskInfo *structure.TaskInfo) (*Agent, map[st } -func (b *Broker) getAgentFromWorker(taskInfo *structure.TaskInfo) (*Agent, map[string]*WorkerClient, error) { +func (b *Broker) getAgentFromWorker(taskInfo *structure.TaskInfo) (*Agent, map[string]*workers.WorkerClient, error) { group := workerMgr.ApplyGroup(taskInfo.SpaceName, taskInfo.GraphName) return b.retrieveAgent(group), workerMgr.GroupWorkerMap(group), nil } diff --git a/vermeer/apps/master/schedules/scheduler_resource_manager.go b/vermeer/apps/master/schedules/scheduler_resource_manager.go index f74b4c591..49a353d28 100644 --- a/vermeer/apps/master/schedules/scheduler_resource_manager.go +++ b/vermeer/apps/master/schedules/scheduler_resource_manager.go @@ -191,6 +191,11 @@ func (rm *SchedulerResourceManager) changeWorkerStatus(workerName string, status if status == WorkerOngoingStatusIdle || status == WorkerOngoingStatusConcurrentRunning { workerInfo := workerMgr.GetWorkerInfo(workerName) + if workerInfo == nil { + logrus.Warnf("worker '%s' not found", workerName) + return + } + // get worker group name groupName := workerInfo.Group if groupName != "" { diff --git a/vermeer/apps/master/schedules/scheduler_task_manager.go b/vermeer/apps/master/schedules/scheduler_task_manager.go index 2b331528f..267c473d5 100644 --- a/vermeer/apps/master/schedules/scheduler_task_manager.go +++ b/vermeer/apps/master/schedules/scheduler_task_manager.go @@ -103,6 +103,7 @@ func (t *SchedulerTaskManager) RemoveTask(taskID int32) error { if _, exists := t.allTaskMap[taskID]; !exists { return errors.New("task not found") } + defer t.Unlock(t.Lock()) delete(t.allTaskMap, taskID) // remove from queue for i, task := range t.allTaskQueue { @@ -126,6 +127,7 @@ func (t *SchedulerTaskManager) MarkTaskComplete(taskID int32) error { if _, exists := t.allTaskMap[taskID]; !exists { return errors.New("task not found") } + defer t.Unlock(t.Lock()) delete(t.notCompleteTasks, taskID) return nil } From c5c8e150ca26289b4775ead9cbb574337c938d78 Mon Sep 17 00:00:00 2001 From: ethereal Date: Fri, 26 Sep 2025 15:10:21 +0800 Subject: [PATCH 26/27] chore: repair ai --- .../schedules/scheduler_resource_manager.go | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vermeer/apps/master/schedules/scheduler_resource_manager.go b/vermeer/apps/master/schedules/scheduler_resource_manager.go index 49a353d28..485659f03 100644 --- a/vermeer/apps/master/schedules/scheduler_resource_manager.go +++ b/vermeer/apps/master/schedules/scheduler_resource_manager.go @@ -199,19 +199,28 @@ func (rm *SchedulerResourceManager) changeWorkerStatus(workerName string, status // get worker group name groupName := workerInfo.Group if groupName != "" { - // check all workers in this group are idle - allIdleOrConcurrent := true - for _, w := range workerMgr.GetGroupWorkers(groupName) { - if rm.workerStatus[w.Name] != WorkerOngoingStatusIdle && rm.workerStatus[w.Name] != WorkerOngoingStatusConcurrentRunning { - allIdleOrConcurrent = false - break + gws := workerMgr.GetGroupWorkers(groupName) + allIdle := true + allConcurrent := true + for _, w := range gws { + st := rm.workerStatus[w.Name] + if st != WorkerOngoingStatusIdle { + allIdle = false + } + if st != WorkerOngoingStatusConcurrentRunning { + allConcurrent = false } } - if allIdleOrConcurrent { - logrus.Debugf("Change worker group '%s' status to '%s' because all %d workers are idle or concurrent running", 
groupName, status, len(workerMgr.GetGroupWorkers(groupName))) - rm.changeWorkerGroupStatus(groupName, status) + if allConcurrent || allIdle { + newStatus := WorkerOngoingStatusIdle + if allConcurrent { + newStatus = WorkerOngoingStatusConcurrentRunning + } + logrus.Debugf("Change worker group '%s' status to '%s' (derived from %d workers)", groupName, newStatus, len(gws)) + rm.changeWorkerGroupStatus(groupName, newStatus) } } + } else if status == WorkerOngoingStatusDeleted { delete(rm.workerStatus, workerName) } From 5ee6421581cd62fe2ad7a49abde14d69cdf0eac7 Mon Sep 17 00:00:00 2001 From: ethereal Date: Sat, 18 Oct 2025 15:50:14 +0800 Subject: [PATCH 27/27] chore: some tiny error --- vermeer/apps/master/bl/scheduler_bl.go | 7 ++++-- vermeer/apps/master/bl/task_bl.go | 6 ++++- .../schedules/scheduler_algorithm_manager.go | 17 +++++++++++++ .../schedules/scheduler_cron_manager.go | 17 +++++++++++++ .../schedules/scheduler_resource_manager.go | 17 +++++++++++++ .../schedules/scheduler_task_manager.go | 24 ++++++++++++++++++- vermeer/docker-compose.yaml | 17 +++++++++++++ vermeer/test/scheduler/batch.go | 17 +++++++++++++ vermeer/test/scheduler/priority.go | 17 +++++++++++++ vermeer/test/scheduler/routine.go | 17 +++++++++++++ vermeer/test/scheduler/test_scheduler.go | 17 +++++++++++++ 11 files changed, 169 insertions(+), 4 deletions(-) diff --git a/vermeer/apps/master/bl/scheduler_bl.go b/vermeer/apps/master/bl/scheduler_bl.go index 4fd5df97c..955789491 100644 --- a/vermeer/apps/master/bl/scheduler_bl.go +++ b/vermeer/apps/master/bl/scheduler_bl.go @@ -19,6 +19,7 @@ package bl import ( "errors" + "fmt" "strconv" "time" "vermeer/apps/common" @@ -201,7 +202,9 @@ func (s *ScheduleBl) tryScheduleInner(softSchedule bool, noLock ...bool) error { case s.startChan <- task: logrus.Infof("task '%d' sent to start channel", task.ID) default: - logrus.Warnf("start channel is full, task '%d' could not be sent", task.ID) + errMsg := fmt.Sprintf("start channel is full, cannot schedule task %d", task.ID) + logrus.Errorf(errMsg) + taskMgr.SetError(task, errMsg) } } @@ -368,7 +371,7 @@ func (s *ScheduleBl) ChangeWorkerStatus(workerName string, status schedules.Work func (s *ScheduleBl) waitingStartedTask() { for taskInfo := range s.startChan { if taskInfo == nil { - logrus.Warnf("recieved a nil task from startChan") + logrus.Warnf("received a nil task from startChan") continue } diff --git a/vermeer/apps/master/bl/task_bl.go b/vermeer/apps/master/bl/task_bl.go index 49724fba6..1c7e43c9a 100644 --- a/vermeer/apps/master/bl/task_bl.go +++ b/vermeer/apps/master/bl/task_bl.go @@ -20,6 +20,7 @@ package bl import ( "errors" "fmt" + "math" "sort" "strconv" "strings" @@ -74,6 +75,9 @@ func (tb *TaskBl) CreateTaskInfo( if p < 0 { return nil, fmt.Errorf("priority should be non-negative") } + if p > math.MaxInt32 { + return nil, fmt.Errorf("priority exceeds maximum value: %d", math.MaxInt32) + } taskInfo.Priority = int32(p) } else { logrus.Warnf("priority convert to int32 error:%v", err) @@ -85,7 +89,7 @@ func (tb *TaskBl) CreateTaskInfo( for _, preorder := range preorderList { if pid, err := strconv.ParseInt(preorder, 10, 32); err == nil { if taskMgr.GetTaskByID(int32(pid)) == nil { - return nil, fmt.Errorf("preorder task id %d not exists", pid) + return nil, fmt.Errorf("preorder task with ID %d does not exist", pid) } taskInfo.Preorders = append(taskInfo.Preorders, int32(pid)) } else { diff --git a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go 
index 1d50a4509..c65e9f4de 100644 --- a/vermeer/apps/master/schedules/scheduler_algorithm_manager.go +++ b/vermeer/apps/master/schedules/scheduler_algorithm_manager.go @@ -1,3 +1,20 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with this +work for additional information regarding copyright ownership. The ASF +licenses this file to You under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. +*/ + package schedules import ( diff --git a/vermeer/apps/master/schedules/scheduler_cron_manager.go b/vermeer/apps/master/schedules/scheduler_cron_manager.go index 12ad5dbd7..651e1b9e4 100644 --- a/vermeer/apps/master/schedules/scheduler_cron_manager.go +++ b/vermeer/apps/master/schedules/scheduler_cron_manager.go @@ -1,3 +1,20 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with this +work for additional information regarding copyright ownership. The ASF +licenses this file to You under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. +*/ + package schedules import ( diff --git a/vermeer/apps/master/schedules/scheduler_resource_manager.go b/vermeer/apps/master/schedules/scheduler_resource_manager.go index 485659f03..8dabd3d08 100644 --- a/vermeer/apps/master/schedules/scheduler_resource_manager.go +++ b/vermeer/apps/master/schedules/scheduler_resource_manager.go @@ -1,3 +1,20 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with this +work for additional information regarding copyright ownership. The ASF +licenses this file to You under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. 
+*/ + package schedules import ( diff --git a/vermeer/apps/master/schedules/scheduler_task_manager.go b/vermeer/apps/master/schedules/scheduler_task_manager.go index 267c473d5..21767a391 100644 --- a/vermeer/apps/master/schedules/scheduler_task_manager.go +++ b/vermeer/apps/master/schedules/scheduler_task_manager.go @@ -1,7 +1,25 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with this +work for additional information regarding copyright ownership. The ASF +licenses this file to You under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. +*/ + package schedules import ( "errors" + "vermeer/apps/common" "vermeer/apps/structure" "github.com/sirupsen/logrus" @@ -86,6 +104,10 @@ func (t *SchedulerTaskManager) RefreshTaskToWorkerGroupMap() { * @Return error */ func (t *SchedulerTaskManager) AddTaskStartSequence(taskID int32) error { + if common.GetConfig("debug_mode").(string) != "debug" { + logrus.Warn("TaskStartSequence called but debug features are disabled") + return nil + } if _, exists := t.allTaskMap[taskID]; !exists { return errors.New("task not found") } @@ -202,7 +224,7 @@ func (t *SchedulerTaskManager) GetAllTasksNotComplete() []*structure.TaskInfo { return tasks } -func (t *SchedulerTaskManager) GetAllTasksWaitng() []*structure.TaskInfo { +func (t *SchedulerTaskManager) GetAllTasksWaiting() []*structure.TaskInfo { tasks := make([]*structure.TaskInfo, 0, len(t.allTaskMap)) for _, task := range t.GetAllTasksNotComplete() { if task.State == structure.TaskStateWaiting { diff --git a/vermeer/docker-compose.yaml b/vermeer/docker-compose.yaml index 35a506170..2cf90f6ae 100644 --- a/vermeer/docker-compose.yaml +++ b/vermeer/docker-compose.yaml @@ -1,3 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + version: '3.8' services: diff --git a/vermeer/test/scheduler/batch.go b/vermeer/test/scheduler/batch.go index 565bfea29..e2d6611e1 100644 --- a/vermeer/test/scheduler/batch.go +++ b/vermeer/test/scheduler/batch.go @@ -1,3 +1,20 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with this +work for additional information regarding copyright ownership. 
The ASF +licenses this file to You under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. +*/ + package scheduler import ( diff --git a/vermeer/test/scheduler/priority.go b/vermeer/test/scheduler/priority.go index ef68e3dda..f15da6c9d 100644 --- a/vermeer/test/scheduler/priority.go +++ b/vermeer/test/scheduler/priority.go @@ -1,3 +1,20 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with this +work for additional information regarding copyright ownership. The ASF +licenses this file to You under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. +*/ + package scheduler import ( diff --git a/vermeer/test/scheduler/routine.go b/vermeer/test/scheduler/routine.go index 55722bcee..e5ced6bb6 100644 --- a/vermeer/test/scheduler/routine.go +++ b/vermeer/test/scheduler/routine.go @@ -1,3 +1,20 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with this +work for additional information regarding copyright ownership. The ASF +licenses this file to You under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. +*/ + package scheduler import ( diff --git a/vermeer/test/scheduler/test_scheduler.go b/vermeer/test/scheduler/test_scheduler.go index ea05b258f..7d0274cbe 100644 --- a/vermeer/test/scheduler/test_scheduler.go +++ b/vermeer/test/scheduler/test_scheduler.go @@ -1,3 +1,20 @@ +/* +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with this +work for additional information regarding copyright ownership. The ASF +licenses this file to You under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +License for the specific language governing permissions and limitations +under the License. +*/ + package scheduler import (