← Documentation Home

Agent Mesh Architecture

Backend coordination layer for distributed AI agents and microservices

Overview

Agent Mesh is a backend coordination layer that enables distributed communication and orchestration between AI agents and microservices. It provides high-performance gRPC communication, MCP integration, agent discovery, intelligent load balancing, and comprehensive observability for multi-agent systems.

IMPORTANT: Agent Mesh is a backend service, not an IDE client or VS Code extension. For IDE integration, see AgentStudio-vscode.

OSSA Compliance: 1.0 · License: MIT

Architecture Overview

graph TB
    subgraph "Client Layer"
        CLI[BuildKit CLI]
        Studio[AgentStudio IDE]
        Apps[Custom Applications]
    end

    subgraph "API Layer"
        REST[REST API<br/>Port 3005]
        WS[WebSocket<br/>Real-time Updates]
    end

    subgraph "Mesh Core"
        gRPC[gRPC Server<br/>Port 50051]
        Registry[Agent Registry]
        Router[Request Router]
        LB[Load Balancer]
    end

    subgraph "MCP Integration"
        MCPServer[MCP Servers]
        MCPRegistry[MCP Registry]
        ToolOrch[Tool Orchestrator]
    end

    subgraph "Orchestration"
        Discovery[Agent Discovery]
        Health[Health Monitor]
        Circuit[Circuit Breaker]
        Rate[Rate Limiter]
    end

    subgraph "Agent Pool"
        Workers[Worker Agents]
        Governors[Governor Agents]
        Critics[Critic Agents]
        Observers[Observer Agents]
    end

    subgraph "Observability"
        Phoenix[Arize Phoenix]
        OTLP[OpenTelemetry]
        Prom[Prometheus]
        Jaeger[Jaeger]
    end

    CLI --> REST
    Studio --> REST
    Apps --> gRPC

    REST --> Router
    WS --> Router
    gRPC --> Router

    Router --> Registry
    Router --> LB

    LB --> Discovery
    Discovery --> Health
    Health --> Circuit
    Circuit --> Rate

    Rate --> Workers
    Rate --> Governors
    Rate --> Critics
    Rate --> Observers

    Router --> MCPServer
    MCPServer --> MCPRegistry
    MCPRegistry --> ToolOrch

    Workers --> Phoenix
    Router --> OTLP
    OTLP --> Prom
    OTLP --> Jaeger

Core Components

gRPC Server

High-performance bidirectional streaming for agent-to-agent communication.

Configuration

# config/grpc.yaml
server:
  host: 0.0.0.0
  port: 50051
  maxConcurrentStreams: 100
  keepaliveTime: 30s
  keepaliveTimeout: 10s

security:
  mtls:
    enabled: true
    certPath: /etc/certs/server.crt
    keyPath: /etc/certs/server.key
    caPath: /etc/certs/ca.crt

performance:
  maxMessageSize: 4194304  # 4MB
  compression: gzip
  connectionPoolSize: 100

Protocol Buffers

// proto/agent-mesh.proto
syntax = "proto3";

package agent_mesh.v1;

service AgentService {
  rpc RegisterAgent(RegisterAgentRequest) returns (RegisterAgentResponse);
  rpc DiscoverAgents(DiscoverAgentsRequest) returns (DiscoverAgentsResponse);
  rpc StreamAgentStatus(stream AgentStatusUpdate) returns (stream AgentStatusUpdate);
}

service OrchestrationService {
  rpc OrchestrateTasks(OrchestrationRequest) returns (OrchestrationResponse);
  rpc StreamTaskUpdates(TaskSubscription) returns (stream TaskUpdate);
}

service MCPService {
  rpc ListMCPServers(ListMCPServersRequest) returns (ListMCPServersResponse);
  rpc InvokeTool(InvokeToolRequest) returns (InvokeToolResponse);
}

message RegisterAgentRequest {
  string agent_id = 1;
  string agent_type = 2;
  repeated string capabilities = 3;
  map<string, string> metadata = 4;
}

message OrchestrationRequest {
  string task_id = 1;
  repeated string agent_ids = 2;
  string coordination_strategy = 3;
  bytes task_payload = 4;
}

gRPC Client Example

import { AgentMeshClient } from '@bluefly/agent-mesh'

// Initialize gRPC client
const client = new AgentMeshClient({
  address: 'localhost:50051',
  tls: {
    cert: fs.readFileSync('/path/to/client.crt'),
    key: fs.readFileSync('/path/to/client.key'),
    ca: fs.readFileSync('/path/to/ca.crt')
  }
})

// Register agent
await client.registerAgent({
  agentId: 'tdd-enforcer-001',
  agentType: 'governor',
  capabilities: ['testing', 'validation', 'coverage-analysis'],
  metadata: {
    version: '1.0.0',
    runtime: 'kubernetes'
  }
})

// Stream agent status
const stream = client.streamAgentStatus()

stream.on('data', (status) => {
  console.log(`Agent ${status.agentId}: ${status.state}`)
})

// Orchestrate tasks
const response = await client.orchestrateTasks({
  taskId: 'task-123',
  agentIds: ['tdd-enforcer-001', 'api-builder-002'],
  coordinationStrategy: 'sequential',
  taskPayload: Buffer.from(JSON.stringify({ files: ['src/**/*.ts'] }))
})

Agent Registry

Central registry for agent discovery and capability management.

Agent Registration

// Backend: Agent self-registration
import { AgentRegistry } from '@bluefly/agent-mesh/backend/src/services/agent-registry'

const registry = new AgentRegistry({
  storage: 'redis',
  ttl: 60000,  // 1 minute registration TTL; the heartbeat below keeps it alive
  namespace: 'llm-agents'
})

// Register agent
await registry.register({
  id: 'tdd-enforcer-001',
  type: 'governor',
  capabilities: ['testing', 'validation'],
  endpoint: 'grpc://tdd-enforcer-001.agents.svc.cluster.local:50051',
  metadata: {
    ossa_version: '1.0',
    max_concurrent_tasks: 5
  }
})

// Heartbeat (keeps registration alive)
setInterval(async () => {
  await registry.heartbeat('tdd-enforcer-001')
}, 30000)

Agent Discovery

// Discover agents by capability
const agents = await registry.discover({
  capabilities: ['testing'],
  healthStatus: 'healthy',
  namespace: 'llm-agents'
})

// Discover specific agent
const agent = await registry.get('tdd-enforcer-001')

// Query by OSSA compliance
const ossaAgents = await registry.discover({
  filter: {
    'metadata.ossa_version': '1.0',
    'metadata.certified': true
  }
})

Request Router

Intelligent routing of requests to appropriate agents.

import { RequestRouter } from '@bluefly/agent-mesh/backend/src/mesh/request-router'

const router = new RequestRouter({
  loadBalancer: {
    strategy: 'least-loaded',
    healthCheckInterval: 10000
  },
  circuitBreaker: {
    failureThreshold: 5,
    timeout: 30000,
    resetTimeout: 60000
  },
  retry: {
    maxAttempts: 3,
    backoff: 'exponential',
    initialDelay: 1000
  }
})

// Route request to best available agent
const result = await router.route({
  capability: 'testing',
  payload: { file: 'src/api/users.ts' },
  timeout: 30000
})

Routing Strategies

Round Robin

router.setStrategy('round-robin', {
  weights: {
    'tdd-enforcer-001': 1,
    'tdd-enforcer-002': 1,
    'tdd-enforcer-003': 1
  }
})

Least Loaded

router.setStrategy('least-loaded', {
  metric: 'active_tasks',
  refreshInterval: 5000
})

Priority-Based

router.setStrategy('priority', {
  priorities: {
    'tdd-enforcer-001': 10,  // Highest priority
    'tdd-enforcer-002': 5,
    'tdd-enforcer-003': 1
  }
})

Capability-Based

router.setStrategy('capability', {
  required: ['testing', 'mutation-analysis'],
  preferred: ['fast-execution']
})

Load Balancer

Distributes requests across agent instances with intelligent health-aware routing.

import { LoadBalancer } from '@bluefly/agent-mesh/backend/src/services/load-balancer'

const lb = new LoadBalancer({
  healthCheck: {
    interval: 10000,
    timeout: 5000,
    unhealthyThreshold: 3,
    healthyThreshold: 2
  },
  weights: {
    cpuWeight: 0.4,
    memoryWeight: 0.3,
    taskWeight: 0.3
  }
})

// Get next available agent
const agent = await lb.selectAgent({
  capability: 'testing',
  preferLocal: true,
  maxLatency: 100  // milliseconds
})

// Load balancing metrics
const metrics = await lb.getMetrics()
// {
//   totalRequests: 10000,
//   distribution: {
//     'tdd-enforcer-001': 3500,
//     'tdd-enforcer-002': 3300,
//     'tdd-enforcer-003': 3200
//   },
//   avgLatency: 45,
//   errorRate: 0.01
// }

Health Monitor

Continuous health checking with automatic failover.

import { HealthMonitor } from '@bluefly/agent-mesh/backend/src/services/health-monitor'

const healthMonitor = new HealthMonitor({
  checkInterval: 10000,
  checks: [
    'heartbeat',
    'resource-usage',
    'task-completion-rate'
  ]
})

// Register health check
healthMonitor.registerCheck('tdd-enforcer-001', {
  type: 'grpc',
  endpoint: 'tdd-enforcer-001.agents.svc.cluster.local:50051',
  method: 'grpc.health.v1.Health/Check'
})

// Monitor health events
healthMonitor.on('unhealthy', (agent) => {
  console.log(`Agent ${agent.id} became unhealthy`)
  // Trigger failover
  router.markUnhealthy(agent.id)
})

healthMonitor.on('recovered', (agent) => {
  console.log(`Agent ${agent.id} recovered`)
  router.markHealthy(agent.id)
})

Circuit Breaker

Prevents cascading failures with automatic circuit breaking.

import { CircuitBreaker } from '@bluefly/agent-mesh/backend/src/mesh/circuit-breaker'

const breaker = new CircuitBreaker({
  failureThreshold: 5,      // Open after 5 failures
  timeout: 30000,           // 30s timeout
  resetTimeout: 60000,      // Try again after 60s
  monitoringPeriod: 10000   // 10s monitoring window
})

// Wrap agent call with circuit breaker
const result = await breaker.execute(async () => {
  return await agent.executeTask(task)
})

// Circuit states: CLOSED, OPEN, HALF_OPEN
breaker.on('open', () => {
  console.log('Circuit opened - too many failures')
})

breaker.on('half-open', () => {
  console.log('Circuit half-open - testing recovery')
})

breaker.on('closed', () => {
  console.log('Circuit closed - service recovered')
})

Rate Limiter

Prevents agent overload with configurable rate limiting.

import { RateLimiter } from '@bluefly/agent-mesh/backend/src/services/rate-limiter'

const limiter = new RateLimiter({
  strategy: 'token-bucket',
  limits: {
    'tdd-enforcer-001': {
      requestsPerSecond: 10,
      burstSize: 20
    },
    'api-builder-002': {
      requestsPerSecond: 5,
      burstSize: 10
    }
  }
})

// Check if request is allowed
const allowed = await limiter.allowRequest('tdd-enforcer-001')

if (!allowed) {
  throw new Error('Rate limit exceeded')
}

MCP Integration

MCP Server Registry

import { MCPRegistry } from '@bluefly/agent-mesh/backend/src/mcp/mcp-registry'

const mcpRegistry = new MCPRegistry()

// Register MCP server
await mcpRegistry.register({
  name: 'filesystem-server',
  url: 'http://localhost:3001',
  protocol: 'json-rpc-2.0',
  capabilities: ['read_file', 'write_file', 'list_directory']
})

// List available MCP servers
const servers = await mcpRegistry.list()

// Get specific server
const server = await mcpRegistry.get('filesystem-server')

MCP Tool Orchestration

import { MCPClient } from '@bluefly/agent-mesh/backend/src/mcp/mcp-client'

const mcpClient = new MCPClient({
  timeout: 30000,
  retries: 3
})

// Connect to MCP server
await mcpClient.connect('http://localhost:3001')

// List available tools
const tools = await mcpClient.listTools()

// Invoke tool
const result = await mcpClient.invokeTool('read_file', {
  path: '/path/to/file.ts'
})

// Tool orchestration across multiple servers
const orchestrated = await mcpClient.orchestrate([
  {
    server: 'filesystem-server',
    tool: 'read_file',
    args: { path: 'src/api/users.ts' }
  },
  {
    server: 'analysis-server',
    tool: 'analyze_code',
    args: { code: '${previous.result}' }
  }
])

MCP Configuration

# config/mcp.yaml
mcp:
  enabled: true
  timeout: 30000
  retries: 3

  servers:
    - name: filesystem-server
      url: http://localhost:3001
      protocol: json-rpc-2.0
      auth:
        type: bearer
        token: ${MCP_FILESYSTEM_TOKEN}

    - name: postgres-server
      url: http://localhost:3002
      protocol: json-rpc-2.0
      auth:
        type: basic
        username: ${MCP_POSTGRES_USER}
        password: ${MCP_POSTGRES_PASSWORD}

    - name: memory-server
      url: http://localhost:3003
      protocol: json-rpc-2.0

  discovery:
    enabled: true
    interval: 60000
    registry: redis://localhost:6379

Observability Integration

Arize Phoenix Tracing

import { PhoenixTracer } from '@bluefly/agent-mesh/backend/src/observability/phoenix-tracer'

const tracer = new PhoenixTracer({
  endpoint: 'http://localhost:6006',
  project: 'agent-mesh',
  enabled: true
})

// Trace agent communication
await tracer.trace({
  name: 'agent-mesh.orchestrate',
  attributes: {
    agentIds: ['tdd-enforcer-001', 'api-builder-002'],
    taskId: 'task-123'
  },
  fn: async () => {
    return await orchestrateTasks(...)
  }
})

OpenTelemetry Export

import { OTLPExporter } from '@bluefly/agent-mesh/backend/src/observability/otlp-exporter'

const otlp = new OTLPExporter({
  endpoint: 'http://localhost:4317',
  serviceName: 'agent-mesh',
  attributes: {
    'service.version': '1.0.0',
    'deployment.environment': 'production'
  }
})

// Export traces
await otlp.exportTraces(spans)

// Export metrics
await otlp.exportMetrics(metrics)

Prometheus Metrics

import { PrometheusExporter } from '@bluefly/agent-mesh/backend/src/observability/prometheus-exporter'

const prometheus = new PrometheusExporter({
  port: 9090,
  path: '/metrics',
  prefix: 'agent_mesh_'
})

// Register metrics
prometheus.registerCounter('requests_total', {
  help: 'Total number of requests',
  labelNames: ['agent_type', 'capability', 'status']
})

prometheus.registerHistogram('request_duration_seconds', {
  help: 'Request duration in seconds',
  labelNames: ['agent_type', 'capability'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5]
})

// Increment counter
prometheus.increment('requests_total', {
  agent_type: 'governor',
  capability: 'testing',
  status: 'success'
})

// Observe histogram
prometheus.observe('request_duration_seconds', 0.45, {
  agent_type: 'governor',
  capability: 'testing'
})

CLI Commands

# Agent orchestration
agent-mesh orchestrate --agents agent1,agent2 --task "task-description"
agent-mesh discover --namespace llm-agents
agent-mesh health --all
agent-mesh route --from agent-a --to agent-b --strategy round-robin

# MCP operations
agent-mesh mcp list
agent-mesh mcp register --server mcp-server-url
agent-mesh mcp invoke --tool tool-name --args '{"key":"value"}'

# Observability
agent-mesh metrics --export prometheus
agent-mesh trace --agent-id agent-123
agent-mesh logs --follow --agent agent-name

REST API Reference

Agent Management

POST /api/v1/agents/register
Content-Type: application/json

{
  "agentId": "tdd-enforcer-001",
  "agentType": "governor",
  "capabilities": ["testing", "validation"],
  "metadata": {
    "version": "1.0.0"
  }
}

GET /api/v1/agents?capability=testing&healthStatus=healthy

Orchestration

POST /api/v1/orchestrate
Content-Type: application/json

{
  "taskId": "task-123",
  "agentIds": ["tdd-enforcer-001", "api-builder-002"],
  "coordinationStrategy": "sequential",
  "payload": {
    "files": ["src/**/*.ts"]
  }
}

MCP Operations

GET /api/v1/mcp/servers

POST /api/v1/mcp/invoke
Content-Type: application/json

{
  "server": "filesystem-server",
  "tool": "read_file",
  "args": {
    "path": "/path/to/file"
  }
}

Health & Metrics

GET /health

GET /metrics
Accept: text/plain

Security

mTLS Configuration

# Generate CA certificate
openssl genrsa -out ca.key 4096
openssl req -new -x509 -key ca.key -out ca.crt -days 365

# Generate server certificate
openssl genrsa -out server.key 2048
openssl req -new -key server.key -out server.csr
openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt -days 365

# Generate client certificate
openssl genrsa -out client.key 2048
openssl req -new -key client.key -out client.csr
openssl x509 -req -in client.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out client.crt -days 365

Environment Variables

# Server configuration
AGENT_MESH_PORT=3005
AGENT_MESH_GRPC_PORT=50051
AGENT_MESH_MCP_ENABLED=true
AGENT_MESH_NAMESPACE=llm-agents

# Security
AGENT_MESH_MTLS_ENABLED=true
AGENT_MESH_CERT_PATH=/etc/certs/server.crt
AGENT_MESH_KEY_PATH=/etc/certs/server.key
AGENT_MESH_CA_PATH=/etc/certs/ca.crt

# Observability
PHOENIX_COLLECTOR_ENDPOINT=http://localhost:6006
JAEGER_ENDPOINT=http://localhost:14268/api/traces
PROMETHEUS_PORT=9090

Kubernetes Deployment

# Deploy to dev
kubectl apply -f infrastructure/kubernetes/dev/

# Deploy to production
kubectl apply -f infrastructure/kubernetes/prod/

# Using Helm
helm upgrade --install agent-mesh ./helm/