diff --git a/README.md b/README.md index bf209713..6629c499 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ A Model Context Protocol (MCP) server that provides tools for interacting with C ## Features - **Multiple Transport Options**: Support for STDIO, HTTP, and Server-Sent Events (SSE) transports -- **OAuth 2.0 Authorization**: Forward Bearer tokens to ClickHouse for token-based authentication via OIDC providers (see [OAuth 2.0 Documentation](docs/oauth_authorization.md)) +- **OAuth 2.0 Authorization**: Broker OAuth flows from MCP clients against any OIDC provider; auto-detects Bearer vs Basic auth for ClickHouse (see [OAuth 2.0 Documentation](docs/oauth_authorization.md)) - **JWE Authentication**: Optional JWE-based authentication with encryption for secure database access - **TLS Support**: Full TLS encryption support for both ClickHouse® connections and MCP server endpoints - **Comprehensive Tools**: Built-in tools for listing tables, describing schemas, and executing queries @@ -236,7 +236,7 @@ export LOG_LEVEL=debug # OAuth — env-var injection lets operators pull secrets from a Kubernetes # Secret via `valueFrom.secretKeyRef` instead of committing them to YAML. export MCP_OAUTH_ENABLED=true -export MCP_OAUTH_MODE=gating +export MCP_OAUTH_BROKER=true export MCP_OAUTH_ISSUER=https://accounts.example.com export MCP_OAUTH_SIGNING_SECRET=... @@ -357,7 +357,7 @@ JWE takes priority — if present and valid and has valid credentials, use it an ### OAuth 2.0 Authorization -The MCP server supports OAuth 2.0/OpenID Connect authentication, with token forwarding to ClickHouse or token verification locally. This enables MCP clients to authenticate via an Identity Provider (Keycloak, Azure AD, Google, AWS Cognito) and have their Bearer tokens forwarded to ClickHouse for token-based authentication via `token_processors`. +The MCP server supports OAuth 2.0/OpenID Connect authentication, with Bearer tokens presented to ClickHouse or verified locally. This enables MCP clients to authenticate via an Identity Provider (Keycloak, Azure AD, Google, AWS Cognito) and use token-based ClickHouse authentication via `token_processors`. For full setup instructions, provider-specific guides, and ClickHouse configuration, see the [OAuth 2.0 Authorization Documentation](docs/oauth_authorization.md). diff --git a/cmd/altinity-mcp/client_assertion_test.go b/cmd/altinity-mcp/client_assertion_test.go index 9f0c0a93..6f098c43 100644 --- a/cmd/altinity-mcp/client_assertion_test.go +++ b/cmd/altinity-mcp/client_assertion_test.go @@ -360,7 +360,7 @@ func TestHandleOAuthTokenAuthCode_LenientPrivateKeyJWT(t *testing.T) { // helper directly. parseCIMDMetadata only fails on bad shape, so // reaching here proves the parser accepts both methods (covered); // the lenient runtime branch is exercised by integration tests in - // oauth_regression_test.go via the broker_upstream flow. This unit + // oauth_regression_test.go via the broker flow. This unit // test guards the parser-side accept of "private_key_jwt" against a // future revert to strict-mode-only. } diff --git a/cmd/altinity-mcp/main.go b/cmd/altinity-mcp/main.go index 5f7b1e94..41076786 100644 --- a/cmd/altinity-mcp/main.go +++ b/cmd/altinity-mcp/main.go @@ -11,6 +11,7 @@ import ( "net/url" "os" "os/signal" + "reflect" "strings" "sync" "syscall" @@ -685,6 +686,12 @@ func (a *application) healthHandler(w http.ResponseWriter, r *http.Request) { "version": version, } + // Surface multi-cluster catalog-cache counters (there is no Prometheus + // subsystem; /health is the observability surface). nil in single-cluster. + if a.mcCache != nil { + status["catalog_cache"] = a.mcCache.Snapshot() + } + // Test ClickHouse connection for readiness, unless credentials are per-request credentialsArePerRequest := cfg.Server.JWE.Enabled || cfg.Server.OAuth.Enabled @@ -751,6 +758,7 @@ func buildConfig(cmd CommandInterface) (config.Config, error) { // Override with CLI flags (CLI flags take precedence over config file) overrideWithCLIFlags(&cfg, cmd) + config.ApplyMulticlusterDefaults(&cfg) if logErr := setupLogging(string(cfg.Logging.Level)); logErr != nil { return cfg, fmt.Errorf("failed setup logging %s level: %w", cfg.Logging.Level, logErr) } @@ -865,11 +873,6 @@ func warnOAuthMisconfiguration(cfg config.Config) { if !oauth.Enabled { return } - if oauth.IsGatingMode() && strings.TrimSpace(oauth.PublicAuthServerURL) == "" && strings.TrimSpace(oauth.Issuer) != "" { - log.Warn().Msg("OAuth gating mode: public_auth_server_url is not set — " + - "minted tokens will use the request Host as issuer, but validation expects the configured issuer; " + - "set public_auth_server_url to match, or leave issuer empty to skip issuer validation") - } // PublicResourceURL pins the canonical RFC 9728 `resource` URL (and the // audience the RFC 8707 resource indicator is validated against in // /authorize). When unset, we fall back to the request's Host header, @@ -882,17 +885,6 @@ func warnOAuthMisconfiguration(cfg config.Config) { "to the request Host header. For production deployments behind a single canonical " + "hostname, set MCP_OAUTH_PUBLIC_RESOURCE_URL to lock the resource identity.") } - // C-1 nudge: forward mode without any JWKS source means we cannot validate - // JWT bearers locally. The auth layer soft-passes such tokens to ClickHouse, - // which is then the sole validator. MCP authorization spec §Token Handling - // requires the resource server to validate tokens; configuring oauth_issuer - // or oauth_jwks_url enables that. - if oauth.IsForwardMode() && strings.TrimSpace(oauth.Issuer) == "" && strings.TrimSpace(oauth.JWKSURL) == "" { - log.Warn().Msg("OAuth forward mode: neither oauth_issuer nor oauth_jwks_url is set — " + - "JWT bearers cannot be validated locally (signature/iss/aud/exp); " + - "the MCP server will forward them as-is and rely entirely on ClickHouse-side validation. " + - "Set MCP_OAUTH_ISSUER or MCP_OAUTH_JWKS_URL to enable local validation per MCP authorization spec §Token Handling.") - } } // testConnection tests the connection to ClickHouse @@ -981,6 +973,12 @@ type application struct { // cimdResolver fetches and caches inbound CIMD client metadata documents. // Constructed in newApplication; tests inject an alternative resolver. cimdResolver *cimdResolver + // mcRouter and mcCache are set only when cfg.Multicluster.Enabled is + // true at process start. multicluster.* fields are restart-only — + // changing them via config reload logs a warning but does not rebuild + // these. + mcRouter *altinitymcp.MulticlusterRouter + mcCache *altinitymcp.CatalogCache } // setHTTPServer sets the HTTP server with proper synchronization @@ -1001,13 +999,14 @@ func newApplication(ctx context.Context, cfg config.Config, cmd CommandInterface if err := validateOAuthRuntimeConfig(cfg); err != nil { return nil, err } + if err := validateMulticlusterRuntimeConfig(cfg); err != nil { + return nil, err + } // Test connection to ClickHouse at startup, unless credentials are dynamic: // - JWE: each request carries its own ClickHouse credentials - // - OAuth (forward or gating): credentials are derived from the JWT per - // request. Forward mode rewrites the Authorization header; gating mode - // sends Basic base64(email:JWT) for the CH-side JWT-verifier sidecar to - // validate. The static helm-configured Username/Password are unused. + // - OAuth: credentials are derived from the bearer per request. The static + // helm-configured Username/Password are unused. skipStartupPing := cfg.Server.JWE.Enabled || cfg.Server.OAuth.Enabled if !skipStartupPing { log.Debug().Msg("Testing ClickHouse connection...") @@ -1059,6 +1058,24 @@ func newApplication(ctx context.Context, cfg config.Config, cmd CommandInterface cimdResolver: newCIMDResolver(nil), } + // Multi-cluster routing is restart-only. Build the router + catalog + // cache once during newApplication and stash them on the app; the + // HTTP server consults these on every request. + if cfg.Multicluster.Enabled { + router, err := altinitymcp.NewMulticlusterRouter(cfg.Multicluster, cfg.ClickHouse) + if err != nil { + return nil, fmt.Errorf("multicluster: failed to build router: %w", err) + } + app.mcRouter = router + app.mcCache = altinitymcp.NewCatalogCache(cfg.Multicluster) + log.Info(). + Strs("cluster_allowlist", cfg.Multicluster.ClusterAllowlist). + Int("catalog_cache_max", cfg.Multicluster.CatalogCacheMax). + Dur("catalog_ttl_fallback", cfg.Multicluster.CatalogTTLFallback). + Dur("catalog_negative_ttl", cfg.Multicluster.CatalogNegativeTTL). + Msg("multicluster routing enabled — /mcp/{cluster} dispatches per-cluster catalogs") + } + // Start config reload goroutine if enabled if app.configFile != "" && cfg.ReloadTime > 0 { go app.configReloadLoop(ctx, cmd) @@ -1100,12 +1117,6 @@ func validateOAuthRuntimeConfig(cfg config.Config) error { return nil } - switch cfg.Server.OAuth.NormalizedMode() { - case "forward", "gating": - default: - return fmt.Errorf("unsupported oauth mode: %s", cfg.Server.OAuth.Mode) - } - // M5: refuse to start when the operator points issuer/jwks_url at a // plaintext http:// endpoint. We fetch JWKS and openid-configuration from // these URLs and trust the response to validate signatures — a MitM on @@ -1120,102 +1131,110 @@ func validateOAuthRuntimeConfig(cfg config.Config) error { } signingSecret := strings.TrimSpace(cfg.Server.OAuth.SigningSecret) - if signingSecret == "" { - return fmt.Errorf("oauth signing_secret is required when OAuth is enabled (used for client registration and token exchange in both forward and gating modes)") - } - // Defence in depth: HS256 (gating-mode access token signing) and JWE A256KW - // (client_id + refresh-token wrap) both derive their key as SHA-256(secret). - // SHA-256 spreads bits but doesn't add entropy — a 4-byte secret hashed - // to 32 bytes still has only 32 bits of entropy. 32 bytes is the practical - // minimum to make brute-force forging infeasible. - const minSigningSecretBytes = 32 - if len(signingSecret) < minSigningSecretBytes { - return fmt.Errorf("oauth signing_secret must be at least %d bytes (got %d) — short secrets weaken HS256 and JWE key wrapping; generate with `openssl rand -base64 32` or similar", minSigningSecretBytes, len(signingSecret)) - } - - // Both OAuth modes require HTTP for ClickHouse: forward mode rewrites the - // Authorization header (TCP native protocol has no equivalent); gating - // mode sends Basic base64(email:JWT) to a CH-side http_authentication - // sidecar, which ClickHouse only invokes via the HTTP interface. + + // OAuth requires HTTP for ClickHouse: bearer headers and Basic credentials + // are HTTP auth mechanisms. if cfg.Server.OAuth.Enabled && cfg.ClickHouse.Protocol != config.HTTPProtocol { - return fmt.Errorf("oauth requires clickhouse protocol http (both forward and gating modes route credentials through ClickHouse's HTTP interface)") - } - - // Gating-mode validation has two shapes depending on broker_upstream: - // - // broker_upstream=false (default, #109): MCP is a pure OAuth resource - // server. The external AS (Auth0/CIMD) owns client registration and - // the auth-code flow. Setting any of client_id/client_secret/auth_url/ - // token_url/userinfo_url/public_auth_server_url is a misconfiguration. - // - // broker_upstream=true: MCP also acts as the AS to MCP clients - // (broker-via-MCP), brokering an upstream IdP that does not natively - // support CIMD (e.g. Google). The fields above become REQUIRED. - // The /mcp request path is unchanged from standard gating mode — - // bearer is rewritten to Basic email:JWT for the ch-jwt-verify - // sidecar to validate. Only the OAuth dance changes shape. - if cfg.Server.OAuth.IsGatingMode() { - if !cfg.Server.OAuth.BrokerUpstream { - if cfg.Server.OAuth.ClientID != "" { - return fmt.Errorf("oauth: gating mode (without broker_upstream) forbids oauth.client_id — remove from helm values; client_id is the external AS's responsibility under #109. To act as the AS yourself, set oauth.broker_upstream=true") - } - if cfg.Server.OAuth.ClientSecret != "" { - return fmt.Errorf("oauth: gating mode (without broker_upstream) forbids oauth.client_secret — remove from helm values; client_secret is the external AS's responsibility under #109") - } - if cfg.Server.OAuth.TokenURL != "" { - return fmt.Errorf("oauth: gating mode (without broker_upstream) forbids oauth.token_url — remove from helm values; token_url is the external AS's responsibility under #109") - } - if cfg.Server.OAuth.AuthURL != "" { - return fmt.Errorf("oauth: gating mode (without broker_upstream) forbids oauth.auth_url — remove from helm values; auth_url is the external AS's responsibility under #109") - } - if cfg.Server.OAuth.UserInfoURL != "" { - return fmt.Errorf("oauth: gating mode (without broker_upstream) forbids oauth.userinfo_url — remove from helm values; userinfo_url is the external AS's responsibility under #109") - } - if cfg.Server.OAuth.PublicAuthServerURL != "" { - return fmt.Errorf("oauth: gating mode (without broker_upstream) forbids oauth.public_auth_server_url — remove from helm values; public_auth_server_url is the external AS's responsibility under #109") - } - } else { - // broker_upstream=true: opposite — the broker fields are required. - missing := []string{} - if strings.TrimSpace(cfg.Server.OAuth.ClientID) == "" { - missing = append(missing, "client_id") - } - if strings.TrimSpace(cfg.Server.OAuth.ClientSecret) == "" { - missing = append(missing, "client_secret") - } - if strings.TrimSpace(cfg.Server.OAuth.AuthURL) == "" { - missing = append(missing, "auth_url") - } - if strings.TrimSpace(cfg.Server.OAuth.TokenURL) == "" { - missing = append(missing, "token_url") - } - if len(missing) > 0 { - return fmt.Errorf("oauth: gating mode with broker_upstream=true requires upstream-IdP fields to be set: %s. Without these MCP cannot broker the upstream OAuth dance", strings.Join(missing, ", ")) - } - log.Warn(). - Str("issuer", cfg.Server.OAuth.Issuer). - Str("client_id", cfg.Server.OAuth.ClientID). - Str("auth_url", cfg.Server.OAuth.AuthURL). - Str("token_url", cfg.Server.OAuth.TokenURL). - Msg("oauth: gating mode + broker_upstream=true — altinity-mcp is acting as the OAuth AS to MCP clients, brokering an upstream IdP. /oauth/{register,authorize,callback,token} are exposed. claude.ai/ChatGPT do DCR against this MCP, not against the upstream. Single Google redirect URI: /oauth/callback. This is the post-#109 hybrid; see deploy/otel-google-gating/EXPERIMENT.md.") + return fmt.Errorf("oauth requires clickhouse protocol http") + } + + if cfg.Server.OAuth.Broker { + var missing []string + if strings.TrimSpace(cfg.Server.OAuth.ClientID) == "" { + missing = append(missing, "client_id") + } + if strings.TrimSpace(cfg.Server.OAuth.ClientSecret) == "" { + missing = append(missing, "client_secret") } + if strings.TrimSpace(cfg.Server.OAuth.AuthURL) == "" { + missing = append(missing, "auth_url") + } + if strings.TrimSpace(cfg.Server.OAuth.TokenURL) == "" { + missing = append(missing, "token_url") + } + if len(missing) > 0 { + return fmt.Errorf("oauth: broker=true requires upstream IdP fields: %s", strings.Join(missing, ", ")) + } + if signingSecret == "" { + return fmt.Errorf("oauth signing_secret is required when oauth.broker=true") + } + // Defence in depth: JWE A256KW derives its key from signing_secret. + // SHA-256 spreads bits but doesn't add entropy — a 4-byte secret hashed + // to 32 bytes still has only 32 bits of entropy. 32 bytes is the + // practical minimum to make brute-force forging infeasible. + const minSigningSecretBytes = 32 + if len(signingSecret) < minSigningSecretBytes { + return fmt.Errorf("oauth signing_secret must be at least %d bytes (got %d) — short secrets weaken JWE key wrapping; generate with `openssl rand -base64 32` or similar", minSigningSecretBytes, len(signingSecret)) + } + } else { if strings.TrimSpace(cfg.Server.OAuth.Issuer) == "" { - return fmt.Errorf("oauth: gating mode requires oauth.issuer (the upstream AS, e.g. https://altinity.auth0.com/ or https://accounts.google.com) to be set so MCP can validate JWTs against its JWKS") + return fmt.Errorf("oauth: broker=false requires oauth.issuer (the external AS, e.g. https://altinity.auth0.com/ or https://accounts.google.com)") } if strings.TrimSpace(cfg.Server.OAuth.Audience) == "" { - return fmt.Errorf("oauth: gating mode requires oauth.audience — under !broker_upstream it must byte-equal the MCP public URL (RFC 8707); under broker_upstream it must byte-equal the upstream OAuth client_id (the value in the upstream id_token's `aud` claim)") + return fmt.Errorf("oauth: broker=false requires oauth.audience — it must byte-equal the MCP public resource URL (RFC 8707)") } } return nil } +// validateMulticlusterRuntimeConfig refuses to start the process when +// multi-cluster routing is enabled in a combination that cannot work +// safely. Called from newApplication before any HTTP server is bound. +// +// The checks are intentionally fail-loud rather than fail-quiet: an +// operator misreads "multicluster.enabled: true" alongside JWE auth or +// OpenAPI and ships a deployment that silently breaks per-tenant +// isolation. Earlier failure = clearer signal. +func validateMulticlusterRuntimeConfig(cfg config.Config) error { + if !cfg.Multicluster.Enabled { + return nil + } + if cfg.Server.JWE.Enabled { + return fmt.Errorf("multicluster: JWE is incompatible with multicluster mode (JWE claims carry their own host — bypass cluster routing). Disable server.jwe.enabled or server.multicluster.enabled") + } + if !cfg.Server.OAuth.Enabled { + return fmt.Errorf("multicluster: requires OAuth (server.oauth.enabled=true) — multicluster relies on per-bearer cache keying and per-cluster PRM advertisement") + } + if cfg.Server.OpenAPI.Enabled { + return fmt.Errorf("multicluster: OpenAPI must be disabled in v1 (server.openapi.enabled=false) — per-cluster OpenAPI schema is deferred to v2") + } + if !strings.Contains(cfg.ClickHouse.Host, "{cluster}") { + log.Warn(). + Str("clickhouse.host", cfg.ClickHouse.Host). + Msg("multicluster: clickhouse.host does not contain the {cluster} placeholder — all clusters will route to the same host. Almost certainly a misconfiguration; set host to e.g. chi-{cluster}-{cluster}-0-0.demo") + } + if cfg.Multicluster.CatalogCacheMax < 100 { + return fmt.Errorf("multicluster: catalog_cache_max=%d is too small (min 100) — the cache would thrash under any meaningful number of concurrent users", cfg.Multicluster.CatalogCacheMax) + } + if cfg.Multicluster.CatalogTTLFallback < time.Minute || cfg.Multicluster.CatalogTTLFallback > 24*time.Hour { + return fmt.Errorf("multicluster: catalog_ttl_fallback=%s must be between 1m and 24h", cfg.Multicluster.CatalogTTLFallback) + } + if cfg.Multicluster.CatalogNegativeTTL < 10*time.Second || cfg.Multicluster.CatalogNegativeTTL > 5*time.Minute { + return fmt.Errorf("multicluster: catalog_negative_ttl=%s must be between 10s and 5m", cfg.Multicluster.CatalogNegativeTTL) + } + for _, name := range cfg.Multicluster.ClusterAllowlist { + trimmed := strings.TrimSpace(name) + if trimmed == "" { + continue + } + if !altinitymcp.IsValidClusterName(trimmed) { + return fmt.Errorf("multicluster: cluster_allowlist entry %q is not a valid RFC 1123 DNS label", trimmed) + } + } + return nil +} + func (a *application) Close() { // Stop config reload goroutine if a.configFile != "" { close(a.stopConfigReload) } + if a.mcCache != nil { + a.mcCache.Close() + } + // No resources to close as the ClickHouse client is created and closed per request log.Debug().Msg("Application resources cleaned up") } @@ -1265,6 +1284,17 @@ func (a *application) reloadConfig(cmd CommandInterface) error { // Override with CLI flags overrideWithCLIFlags(newCfg, cmd) + // multicluster.* fields are restart-only: the router + catalog cache + // were bound during newApplication and cannot be safely rebuilt + // mid-flight without dropping in-flight requests. Warn loudly on + // any change so operators see the no-op and know to roll the pod. + a.configMutex.RLock() + oldMC := a.config.Multicluster + a.configMutex.RUnlock() + if !reflect.DeepEqual(oldMC, newCfg.Multicluster) { + log.Warn().Msg("config reload: multicluster.* fields changed — restart required for these to take effect; routing/cache remain on the previous configuration") + } + // Update logging level if changed a.configMutex.Lock() oldLogLevel := a.config.Logging.Level @@ -1315,6 +1345,13 @@ func (a *application) Start() error { // Access the underlying MCPServer from our ClickHouseJWEServer mcpServer := a.mcpServer.MCPServer + if cfg.Multicluster.Enabled { + if cfg.Server.Transport != config.HTTPTransport { + return fmt.Errorf("multicluster mode requires transport: http (got %q) — stdio/sse cannot carry per-cluster routing", cfg.Server.Transport) + } + return a.startMulticlusterHTTPServer(cfg) + } + switch cfg.Server.Transport { case config.StdioTransport: return a.startSTDIOServer(mcpServer) @@ -1329,3 +1366,112 @@ func (a *application) Start() error { return fmt.Errorf("unsupported transport type: %s", cfg.Server.Transport) } } + +// startMulticlusterHTTPServer wires up the HTTP server for multi-cluster +// mode (cfg.Multicluster.Enabled). Routes: +// +// GET /health, /livez, /oauth/*, /.well-known/oauth-protected-resource +// GET /.well-known/oauth-protected-resource/mcp/{cluster} +// GET /mcp/{cluster}/.well-known/oauth-protected-resource +// * /mcp/{cluster} ← multi-cluster MCP entrypoint +// +// /mcp/{cluster} chain (outer to inner): +// - corsHandler, stripTrailingSlash (mux-wide) +// - mcRouter.Middleware: extracts {cluster}, validates, expands host, +// injects (cluster, reqCfg) on ctx. +// - createMCPAuthInjector: existing OAuth bearer extraction. +// - serverInjector (existing): MCP server on ctx. +// - StreamableHTTPHandler with MulticlusterServerFactory.GetServer: +// per-(bearer,cluster) fresh *mcp.Server populated with cached tools. +// +// dynamicToolsInjector is bypassed in MC mode: a global EnsureDynamicTools +// would poison cross-tenant catalogs. The MulticlusterServerFactory +// consults the catalog cache for the right (bearer, cluster) entry on +// each request and builds a fresh server. +func (a *application) startMulticlusterHTTPServer(cfg config.Config) error { + addr := fmt.Sprintf("%s:%d", cfg.Server.Address, cfg.Server.Port) + log.Info(). + Str("address", addr). + Msg("Starting MCP server with multi-cluster HTTP transport") + + corsHandler := func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Access-Control-Allow-Origin", cfg.Server.CORSOrigin) + w.Header().Set("Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS") + w.Header().Set("Access-Control-Allow-Headers", "Content-Type, Authorization, X-Altinity-MCP-Key, Mcp-Protocol-Version, Referer, User-Agent") + w.Header().Set("Access-Control-Max-Age", "86400") + if r.Method == "OPTIONS" { + w.WriteHeader(http.StatusOK) + return + } + next.ServeHTTP(w, r) + }) + } + + authInjector := a.createMCPAuthInjector(cfg) + serverInjector := func(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + ctx := context.WithValue(r.Context(), altinitymcp.CHJWEServerKey, a.mcpServer) + next.ServeHTTP(w, r.WithContext(ctx)) + }) + } + + factory := altinitymcp.NewMulticlusterServerFactory(cfg, a.mcpServer, a.mcCache, version) + sdkHandler := mcp.NewStreamableHTTPHandler(factory.GetServer, &mcp.StreamableHTTPOptions{Stateless: true}) + + mux := http.NewServeMux() + mux.HandleFunc("/health", a.healthHandler) + mux.HandleFunc("/livez", a.livenessHandler) + a.registerOAuthHTTPRoutes(mux) + a.registerMulticlusterPRMRoutes(mux) + + mcpHandler := a.mcRouter.Middleware(authInjector(serverInjector(sdkHandler))) + mux.Handle("/mcp/{cluster}", mcpHandler) + mux.Handle("/mcp/{cluster}/", mcpHandler) + + httpHandler := stripTrailingSlash(corsHandler(mux)) + + a.setHTTPServer(&http.Server{ + Addr: addr, + Handler: httpHandler, + }) + + return a.startHTTPServerWithTLS(cfg, addr, "http") +} + +// registerMulticlusterPRMRoutes registers the per-cluster RFC 9728 PRM +// routes so claude.ai / ChatGPT can auto-discover the protected-resource +// metadata when the user types just the /mcp/{cluster} URL into the +// connector form. +// +// Three routes share the same handler (`handlePRMCluster`): +// +// /.well-known/oauth-protected-resource/mcp/{cluster} ← RFC 9728 §3.1 +// /mcp/{cluster}/.well-known/oauth-protected-resource ← MCP-client probe path +// +// (The host-root /.well-known/oauth-protected-resource is owned by the +// existing registerOAuthHTTPRoutes — we don't override it.) +// +// In v1 the body is identical to the host-root PRM (shared audience). +// Per-cluster audience is v2. +func (a *application) registerMulticlusterPRMRoutes(mux *http.ServeMux) { + mux.HandleFunc("GET /.well-known/oauth-protected-resource/mcp/{cluster}", a.handlePRMCluster) + mux.HandleFunc("GET /mcp/{cluster}/.well-known/oauth-protected-resource", a.handlePRMCluster) +} + +func (a *application) handlePRMCluster(w http.ResponseWriter, r *http.Request) { + cluster := r.PathValue("cluster") + if !altinitymcp.IsValidClusterName(cluster) { + http.NotFound(w, r) + return + } + if a.mcRouter == nil { + http.NotFound(w, r) + return + } + if _, ok := a.mcRouter.ValidateClusterAllowed(cluster); !ok { + http.NotFound(w, r) + return + } + a.handleOAuthProtectedResource(w, r) +} diff --git a/cmd/altinity-mcp/main_test.go b/cmd/altinity-mcp/main_test.go index 12905624..cf0c1a49 100644 --- a/cmd/altinity-mcp/main_test.go +++ b/cmd/altinity-mcp/main_test.go @@ -364,6 +364,26 @@ func (m *mockCommand) IsSet(name string) bool { return m.setFlags[name] } +func TestBuildConfigAppliesMulticlusterDefaultsAfterFlags(t *testing.T) { + t.Parallel() + cmd := &mockCommand{ + flags: map[string]interface{}{ + "multicluster-enabled": true, + }, + setFlags: map[string]bool{ + "multicluster-enabled": true, + }, + stringMaps: make(map[string]map[string]string), + } + + cfg, err := buildConfig(cmd) + require.NoError(t, err) + require.True(t, cfg.Multicluster.Enabled) + require.Equal(t, 10000, cfg.Multicluster.CatalogCacheMax) + require.Equal(t, 15*time.Minute, cfg.Multicluster.CatalogTTLFallback) + require.Equal(t, 60*time.Second, cfg.Multicluster.CatalogNegativeTTL) +} + // TestBuildServerTLSConfig tests server TLS configuration building func TestBuildServerTLSConfig(t *testing.T) { t.Parallel() @@ -3228,197 +3248,166 @@ func generateSelfSignedCert() ([]byte, []byte, error) { func TestValidateOAuthRuntimeConfig(t *testing.T) { t.Parallel() + resourceServerBase := func() config.OAuthConfig { + return config.OAuthConfig{ + Enabled: true, + Issuer: "https://altinity.auth0.com/", + Audience: "https://example-mcp.test/", + } + } + brokerBase := func() config.OAuthConfig { + return config.OAuthConfig{ + Enabled: true, + Broker: true, + SigningSecret: "test-signing-secret-32-byte-key!!", + Issuer: "https://accounts.google.com", + Audience: "some-google-client-id.apps.googleusercontent.com", + ClientID: "some-google-client-id.apps.googleusercontent.com", + ClientSecret: "GOCSPX-redacted", + AuthURL: "https://accounts.google.com/o/oauth2/v2/auth", + TokenURL: "https://oauth2.googleapis.com/token", + } + } + t.Run("disabled_returns_nil", func(t *testing.T) { t.Parallel() cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{Enabled: false}}} require.NoError(t, validateOAuthRuntimeConfig(cfg)) }) - t.Run("unsupported_mode", func(t *testing.T) { + t.Run("resource_server_requires_http", func(t *testing.T) { t.Parallel() - cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "custom", - SigningSecret: "test-signing-secret-32-byte-key!!", - }}} + cfg := config.Config{ + Server: config.ServerConfig{OAuth: resourceServerBase()}, + ClickHouse: config.ClickHouseConfig{Protocol: config.TCPProtocol}, + } err := validateOAuthRuntimeConfig(cfg) - require.Error(t, err) - require.Contains(t, err.Error(), "unsupported oauth mode") + require.ErrorContains(t, err, "requires clickhouse protocol http") }) - t.Run("short_signing_secret_rejected", func(t *testing.T) { + t.Run("valid_resource_server_config", func(t *testing.T) { t.Parallel() - cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "gating", - SigningSecret: "too-short", - }}} - err := validateOAuthRuntimeConfig(cfg) - require.Error(t, err) - require.Contains(t, err.Error(), "at least 32 bytes") - }) - - t.Run("missing_gating_secret", func(t *testing.T) { - t.Parallel() - cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "gating", - SigningSecret: "", - }}} - err := validateOAuthRuntimeConfig(cfg) - require.Error(t, err) - require.Contains(t, err.Error(), "signing_secret is required") + cfg := config.Config{ + Server: config.ServerConfig{OAuth: resourceServerBase()}, + ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, + } + require.NoError(t, validateOAuthRuntimeConfig(cfg)) }) - t.Run("forward_mode_requires_http", func(t *testing.T) { + t.Run("resource_server_requires_issuer", func(t *testing.T) { t.Parallel() + o := resourceServerBase() + o.Issuer = "" cfg := config.Config{ - Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "forward", - SigningSecret: "test-signing-secret-32-byte-key!!", - }}, - ClickHouse: config.ClickHouseConfig{Protocol: config.TCPProtocol}, + Server: config.ServerConfig{OAuth: o}, + ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } err := validateOAuthRuntimeConfig(cfg) - require.Error(t, err) - require.Contains(t, err.Error(), "requires clickhouse protocol http") + require.ErrorContains(t, err, "broker=false requires oauth.issuer") }) - t.Run("valid_gating_config", func(t *testing.T) { + t.Run("resource_server_requires_audience", func(t *testing.T) { t.Parallel() + o := resourceServerBase() + o.Audience = "" cfg := config.Config{ - Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "gating", - SigningSecret: "test-signing-secret-32-byte-key!!", - Issuer: "https://example.auth0.com/", - Audience: "https://example-mcp.test/", - }}, + Server: config.ServerConfig{OAuth: o}, ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } - require.NoError(t, validateOAuthRuntimeConfig(cfg)) + err := validateOAuthRuntimeConfig(cfg) + require.ErrorContains(t, err, "broker=false requires oauth.audience") }) - t.Run("valid_forward_config", func(t *testing.T) { + t.Run("broker_valid_full_config", func(t *testing.T) { t.Parallel() cfg := config.Config{ - Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "forward", - SigningSecret: "test-signing-secret-32-byte-key!!", - }}, + Server: config.ServerConfig{OAuth: brokerBase()}, ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } require.NoError(t, validateOAuthRuntimeConfig(cfg)) }) - // Gating-mode with broker_upstream: opt-in DCR-via-MCP hybrid. The pure- - // resource-server gating shape stays the default; broker shape unlocks - // the AS handlers and requires the upstream-IdP fields to be present. - gatingBrokerBase := func() config.OAuthConfig { - return config.OAuthConfig{ - Enabled: true, - Mode: "gating", - SigningSecret: "test-signing-secret-32-byte-key!!", - Issuer: "https://accounts.google.com", - Audience: "some-google-client-id.apps.googleusercontent.com", - BrokerUpstream: true, - ClientID: "some-google-client-id.apps.googleusercontent.com", - ClientSecret: "GOCSPX-redacted", - AuthURL: "https://accounts.google.com/o/oauth2/v2/auth", - TokenURL: "https://oauth2.googleapis.com/token", - UserInfoURL: "https://openidconnect.googleapis.com/v1/userinfo", - } - } - - t.Run("gating_broker_valid_full_config", func(t *testing.T) { + t.Run("broker_requires_http", func(t *testing.T) { t.Parallel() cfg := config.Config{ - Server: config.ServerConfig{OAuth: gatingBrokerBase()}, - ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, + Server: config.ServerConfig{OAuth: brokerBase()}, + ClickHouse: config.ClickHouseConfig{Protocol: config.TCPProtocol}, } - require.NoError(t, validateOAuthRuntimeConfig(cfg)) + err := validateOAuthRuntimeConfig(cfg) + require.ErrorContains(t, err, "requires clickhouse protocol http") }) - t.Run("gating_broker_missing_client_id_rejected", func(t *testing.T) { + t.Run("broker_missing_signing_secret_rejected", func(t *testing.T) { t.Parallel() - o := gatingBrokerBase() - o.ClientID = "" + o := brokerBase() + o.SigningSecret = "" cfg := config.Config{ Server: config.ServerConfig{OAuth: o}, ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } err := validateOAuthRuntimeConfig(cfg) - require.ErrorContains(t, err, "broker_upstream=true requires upstream-IdP fields") - require.ErrorContains(t, err, "client_id") + require.ErrorContains(t, err, "signing_secret is required") }) - t.Run("gating_broker_missing_client_secret_rejected", func(t *testing.T) { + t.Run("broker_short_signing_secret_rejected", func(t *testing.T) { t.Parallel() - o := gatingBrokerBase() - o.ClientSecret = "" + o := brokerBase() + o.SigningSecret = "too-short" cfg := config.Config{ Server: config.ServerConfig{OAuth: o}, ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } err := validateOAuthRuntimeConfig(cfg) - require.ErrorContains(t, err, "client_secret") + require.ErrorContains(t, err, "at least 32 bytes") }) - t.Run("gating_broker_missing_auth_url_rejected", func(t *testing.T) { + t.Run("broker_missing_client_id_rejected", func(t *testing.T) { t.Parallel() - o := gatingBrokerBase() - o.AuthURL = "" + o := brokerBase() + o.ClientID = "" cfg := config.Config{ Server: config.ServerConfig{OAuth: o}, ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } err := validateOAuthRuntimeConfig(cfg) - require.ErrorContains(t, err, "auth_url") + require.ErrorContains(t, err, "broker=true requires") + require.ErrorContains(t, err, "client_id") }) - t.Run("gating_broker_missing_token_url_rejected", func(t *testing.T) { + t.Run("broker_missing_client_secret_rejected", func(t *testing.T) { t.Parallel() - o := gatingBrokerBase() - o.TokenURL = "" + o := brokerBase() + o.ClientSecret = "" cfg := config.Config{ Server: config.ServerConfig{OAuth: o}, ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } err := validateOAuthRuntimeConfig(cfg) - require.ErrorContains(t, err, "token_url") + require.ErrorContains(t, err, "client_secret") }) - t.Run("gating_non_broker_still_forbids_client_id", func(t *testing.T) { + t.Run("broker_missing_auth_url_rejected", func(t *testing.T) { t.Parallel() - // Default gating (broker_upstream=false) must still reject the - // upstream-IdP fields per #109; broker_upstream is an explicit opt-in. + o := brokerBase() + o.AuthURL = "" cfg := config.Config{ - Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "gating", - SigningSecret: "test-signing-secret-32-byte-key!!", - Issuer: "https://altinity.auth0.com/", - Audience: "https://example-mcp.test/", - ClientID: "some-client", - }}, + Server: config.ServerConfig{OAuth: o}, ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } err := validateOAuthRuntimeConfig(cfg) - require.ErrorContains(t, err, "without broker_upstream") + require.ErrorContains(t, err, "auth_url") }) - t.Run("gating_broker_still_requires_audience", func(t *testing.T) { + t.Run("broker_missing_token_url_rejected", func(t *testing.T) { t.Parallel() - o := gatingBrokerBase() - o.Audience = "" + o := brokerBase() + o.TokenURL = "" cfg := config.Config{ Server: config.ServerConfig{OAuth: o}, ClickHouse: config.ClickHouseConfig{Protocol: config.HTTPProtocol}, } err := validateOAuthRuntimeConfig(cfg) - require.ErrorContains(t, err, "oauth.audience") + require.ErrorContains(t, err, "token_url") }) } @@ -3430,28 +3419,6 @@ func TestWarnOAuthMisconfiguration(t *testing.T) { // Should not panic warnOAuthMisconfiguration(config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{Enabled: false}}}) }) - - t.Run("gating_mode_missing_public_auth_server_url", func(t *testing.T) { - t.Parallel() - // Should log warning but not panic - warnOAuthMisconfiguration(config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "gating", - Issuer: "https://issuer.example.com", - PublicAuthServerURL: "", - }}}) - }) - - t.Run("gating_mode_with_public_auth_server_url", func(t *testing.T) { - t.Parallel() - // Should not warn - warnOAuthMisconfiguration(config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "gating", - Issuer: "https://issuer.example.com", - PublicAuthServerURL: "https://public.example.com", - }}}) - }) } func TestTransportRoutePatterns(t *testing.T) { @@ -3698,11 +3665,11 @@ func TestLivenessHandler(t *testing.T) { }) } -func TestHealthHandler_OAuthForwardMode(t *testing.T) { +func TestHealthHandler_OAuth(t *testing.T) { t.Parallel() cfg := config.Config{ Server: config.ServerConfig{ - OAuth: config.OAuthConfig{Enabled: true, Mode: "forward"}, + OAuth: config.OAuthConfig{Enabled: true}, }, } app := &application{ diff --git a/cmd/altinity-mcp/multicluster_validate_test.go b/cmd/altinity-mcp/multicluster_validate_test.go new file mode 100644 index 00000000..73a04162 --- /dev/null +++ b/cmd/altinity-mcp/multicluster_validate_test.go @@ -0,0 +1,116 @@ +package main + +import ( + "testing" + "time" + + "github.com/altinity/altinity-mcp/pkg/config" + "github.com/stretchr/testify/require" +) + +func TestValidateMulticlusterRuntimeConfig(t *testing.T) { + t.Parallel() + + baseValid := func() config.Config { + return config.Config{ + ClickHouse: config.ClickHouseConfig{ + Host: "chi-{cluster}-{cluster}-0-0.demo", + Port: 8443, + Protocol: config.HTTPProtocol, + }, + Server: config.ServerConfig{ + OAuth: config.OAuthConfig{Enabled: true, SigningSecret: "x"}, + }, + Multicluster: config.MulticlusterConfig{ + Enabled: true, + ClusterAllowlist: []string{"otel", "antalya"}, + CatalogCacheMax: 1000, + CatalogTTLFallback: 15 * time.Minute, + CatalogNegativeTTL: 60 * time.Second, + }, + } + } + + t.Run("disabled_returns_nil", func(t *testing.T) { + t.Parallel() + var cfg config.Config + require.NoError(t, validateMulticlusterRuntimeConfig(cfg)) + }) + + t.Run("valid_passes", func(t *testing.T) { + t.Parallel() + require.NoError(t, validateMulticlusterRuntimeConfig(baseValid())) + }) + + t.Run("rejects_jwe", func(t *testing.T) { + t.Parallel() + cfg := baseValid() + cfg.Server.JWE.Enabled = true + err := validateMulticlusterRuntimeConfig(cfg) + require.Error(t, err) + require.Contains(t, err.Error(), "JWE is incompatible") + }) + + t.Run("requires_oauth", func(t *testing.T) { + t.Parallel() + cfg := baseValid() + cfg.Server.OAuth.Enabled = false + err := validateMulticlusterRuntimeConfig(cfg) + require.Error(t, err) + require.Contains(t, err.Error(), "requires OAuth") + }) + + t.Run("rejects_openapi", func(t *testing.T) { + t.Parallel() + cfg := baseValid() + cfg.Server.OpenAPI.Enabled = true + err := validateMulticlusterRuntimeConfig(cfg) + require.Error(t, err) + require.Contains(t, err.Error(), "OpenAPI must be disabled") + }) + + t.Run("rejects_tiny_cache", func(t *testing.T) { + t.Parallel() + cfg := baseValid() + cfg.Multicluster.CatalogCacheMax = 50 + err := validateMulticlusterRuntimeConfig(cfg) + require.Error(t, err) + require.Contains(t, err.Error(), "catalog_cache_max") + }) + + t.Run("rejects_out_of_range_fallback", func(t *testing.T) { + t.Parallel() + cfg := baseValid() + cfg.Multicluster.CatalogTTLFallback = 30 * time.Second + err := validateMulticlusterRuntimeConfig(cfg) + require.Error(t, err) + require.Contains(t, err.Error(), "catalog_ttl_fallback") + }) + + t.Run("rejects_out_of_range_negative_ttl", func(t *testing.T) { + t.Parallel() + cfg := baseValid() + cfg.Multicluster.CatalogNegativeTTL = 10 * time.Minute + err := validateMulticlusterRuntimeConfig(cfg) + require.Error(t, err) + require.Contains(t, err.Error(), "catalog_negative_ttl") + }) + + t.Run("rejects_bad_allowlist_entry", func(t *testing.T) { + t.Parallel() + cfg := baseValid() + cfg.Multicluster.ClusterAllowlist = []string{"good", "Bad-Caps"} + err := validateMulticlusterRuntimeConfig(cfg) + require.Error(t, err) + require.Contains(t, err.Error(), "Bad-Caps") + }) + + t.Run("warns_on_missing_placeholder", func(t *testing.T) { + t.Parallel() + // Warn only — does not error. Existing structured log captures + // this; we just verify the validator returns nil. + cfg := baseValid() + cfg.ClickHouse.Host = "chi-otel-otel-0-0.demo" + require.NoError(t, validateMulticlusterRuntimeConfig(cfg)) + }) +} diff --git a/cmd/altinity-mcp/oauth_forward_refresh_test.go b/cmd/altinity-mcp/oauth_forward_refresh_test.go index 660e7b47..dc5569ce 100644 --- a/cmd/altinity-mcp/oauth_forward_refresh_test.go +++ b/cmd/altinity-mcp/oauth_forward_refresh_test.go @@ -22,14 +22,13 @@ import ( "github.com/altinity/go-mcp-oauth-sdk/broker" ) -// Tests for issue #121: forward-mode id_token refresh. +// Tests for issue #121: broker id_token refresh. // -// In forward mode the bearer the MCP client receives is the upstream -// id_token itself. Google's silent-SSO can return a cached id_token whose -// `exp` is set from the original mint time, leaving the MCP client with -// only minutes of session even though the access_token says 1h. The fix -// uses the upstream refresh_token at /token to mint a fresh id_token -// before forwarding. +// In broker mode the bearer the MCP client receives is the upstream id_token +// itself. Google's silent-SSO can return a cached id_token whose `exp` is set +// from the original mint time, leaving the MCP client with only minutes of +// session even though the access_token says 1h. The fix uses the upstream +// refresh_token at /token to mint a fresh id_token before returning the bearer. // --- /authorize: access_type=offline added when upstream_offline_access --- @@ -56,7 +55,7 @@ func TestOAuthAuthorize_OfflineAccessParams(t *testing.T) { t.Parallel() cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", + Broker: true, Issuer: tc.issuer, AuthURL: "https://idp.example.com/authorize", TokenURL: "https://idp.example.com/token", @@ -236,7 +235,7 @@ func runTokenExchange(t *testing.T, app *application, cimdURL, redirectURI strin return rr } -func buildForwardModeApp(t *testing.T, upstream *refreshProbeUpstream, cimdURL, redirectURI string) *application { +func buildBrokerApp(t *testing.T, upstream *refreshProbeUpstream, cimdURL, redirectURI string) *application { t.Helper() cimdServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") @@ -245,7 +244,7 @@ func buildForwardModeApp(t *testing.T, upstream *refreshProbeUpstream, cimdURL, t.Cleanup(cimdServer.Close) cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", + Broker: true, Issuer: upstream.server.URL, JWKSURL: upstream.server.URL + "/jwks", AuthURL: upstream.server.URL + "/authorize", @@ -269,7 +268,7 @@ func TestOAuthToken_RefreshesNearExpiredIDToken(t *testing.T) { nearExp := time.Now().Add(2 * time.Minute) freshExp := time.Now().Add(60 * time.Minute) upstream := newRefreshProbeUpstream(t, nearExp, freshExp, "alice@example.com") - app := buildForwardModeApp(t, upstream, "https://demo.example.com/cimd.json", "https://demo.example.com/cb") + app := buildBrokerApp(t, upstream, "https://demo.example.com/cimd.json", "https://demo.example.com/cb") rr := runTokenExchange(t, app, "https://demo.example.com/cimd.json", "https://demo.example.com/cb") require.Equal(t, http.StatusOK, rr.Code, "body=%s", rr.Body.String()) @@ -291,7 +290,7 @@ func TestOAuthToken_SkipsRefreshWhenIDTokenFresh(t *testing.T) { nearExp := time.Now().Add(57 * time.Minute) freshExp := time.Now().Add(60 * time.Minute) upstream := newRefreshProbeUpstream(t, nearExp, freshExp, "bob@example.com") - app := buildForwardModeApp(t, upstream, "https://demo.example.com/cimd.json", "https://demo.example.com/cb") + app := buildBrokerApp(t, upstream, "https://demo.example.com/cimd.json", "https://demo.example.com/cb") rr := runTokenExchange(t, app, "https://demo.example.com/cimd.json", "https://demo.example.com/cb") require.Equal(t, http.StatusOK, rr.Code, "body=%s", rr.Body.String()) @@ -343,7 +342,7 @@ func TestOAuthToken_RefreshFailureSoftFallsBack(t *testing.T) { })) t.Cleanup(cimdServer.Close) cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, Mode: "forward", Issuer: srv.URL, JWKSURL: srv.URL + "/jwks", + Enabled: true, Broker: true, Issuer: srv.URL, JWKSURL: srv.URL + "/jwks", AuthURL: srv.URL + "/authorize", TokenURL: srv.URL + "/token", ClientID: "broker", ClientSecret: "s", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-softfail-32bytes!!!!!", UpstreamOfflineAccess: true, @@ -358,11 +357,9 @@ func TestOAuthToken_RefreshFailureSoftFallsBack(t *testing.T) { require.NotEmpty(t, body["access_token"], "must still hand back the original id_token as bearer") } -// Gating-with-broker_upstream deployments (github-mcp, otel-google-gating-mcp) -// also return the upstream id_token as the bearer, so the same refresh logic -// must apply. Guards against regression to the original forward-mode-only -// gate which left gating+broker silently broken. -func TestOAuthToken_RefreshesNearExpiredIDToken_GatingBrokerUpstream(t *testing.T) { +// Broker deployments return the upstream id_token as the bearer, so the refresh +// logic must apply. +func TestOAuthToken_RefreshesNearExpiredIDToken_Broker(t *testing.T) { t.Parallel() nearExp := time.Now().Add(2 * time.Minute) freshExp := time.Now().Add(60 * time.Minute) @@ -375,17 +372,16 @@ func TestOAuthToken_RefreshesNearExpiredIDToken_GatingBrokerUpstream(t *testing. t.Cleanup(cimdServer.Close) cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "gating", - BrokerUpstream: true, Issuer: upstream.server.URL, + Broker: true, JWKSURL: upstream.server.URL + "/jwks", AuthURL: upstream.server.URL + "/authorize", TokenURL: upstream.server.URL + "/token", ClientID: "broker", ClientSecret: "s", - Audience: "broker", // matches client_id under broker_upstream + Audience: "broker", PublicAuthServerURL: "https://mcp.example.com", - SigningSecret: "regression-refresh-gating-32b!!!!", + SigningSecret: "regression-refresh-broker-32b!!!!", UpstreamOfflineAccess: true, }}} app := &application{ @@ -397,13 +393,13 @@ func TestOAuthToken_RefreshesNearExpiredIDToken_GatingBrokerUpstream(t *testing. require.Equal(t, http.StatusOK, rr.Code, "body=%s", rr.Body.String()) require.Equal(t, int32(1), atomic.LoadInt32(&upstream.codeExchangeCt)) require.Equal(t, int32(1), atomic.LoadInt32(&upstream.refreshCt), - "gating+broker_upstream MUST trigger refresh on near-expired id_token (#121)") + "broker mode MUST trigger refresh on near-expired id_token (#121)") } func TestOAuthToken_NoRefreshWhenUpstreamReturnsNoRefreshToken(t *testing.T) { t.Parallel() // upstream returns near-expired id_token but no refresh_token. We must - // NOT attempt refresh (would 400 or call with empty token), just forward. + // NOT attempt refresh (would 400 or call with empty token), just return it. nearExp := time.Now().Add(3 * time.Minute) priv, err := rsa.GenerateKey(rand.Reader, 2048) require.NoError(t, err) @@ -446,7 +442,7 @@ func TestOAuthToken_NoRefreshWhenUpstreamReturnsNoRefreshToken(t *testing.T) { })) t.Cleanup(cimdServer.Close) cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, Mode: "forward", Issuer: srv.URL, JWKSURL: srv.URL + "/jwks", + Enabled: true, Broker: true, Issuer: srv.URL, JWKSURL: srv.URL + "/jwks", AuthURL: srv.URL + "/authorize", TokenURL: srv.URL + "/token", ClientID: "broker", ClientSecret: "s", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-no-rt-32bytes!!!!!!!!", UpstreamOfflineAccess: true, @@ -464,7 +460,7 @@ func TestOAuthToken_RefreshFallsBackWhenUpstreamReturnsNoIDToken(t *testing.T) { t.Parallel() // Initial code exchange returns near-expired id_token + refresh_token. // Refresh-token grant returns 200 OK with only an access_token (no id_token). - // Must soft-fail: forward original near-expired id_token, /token returns 200. + // Must soft-fail: return original near-expired id_token, /token returns 200. nearExp := time.Now().Add(3 * time.Minute) priv, err := rsa.GenerateKey(rand.Reader, 2048) require.NoError(t, err) @@ -507,7 +503,7 @@ func TestOAuthToken_RefreshFallsBackWhenUpstreamReturnsNoIDToken(t *testing.T) { })) t.Cleanup(cimdServer.Close) cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, Mode: "forward", Issuer: srv.URL, JWKSURL: srv.URL + "/jwks", + Enabled: true, Broker: true, Issuer: srv.URL, JWKSURL: srv.URL + "/jwks", AuthURL: srv.URL + "/authorize", TokenURL: srv.URL + "/token", ClientID: "broker", ClientSecret: "s", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-no-idtok-32bytes!!!!!", UpstreamOfflineAccess: true, @@ -518,7 +514,7 @@ func TestOAuthToken_RefreshFallsBackWhenUpstreamReturnsNoIDToken(t *testing.T) { require.Equal(t, http.StatusOK, rr.Code, "missing id_token on refresh must soft-fail; body=%s", rr.Body.String()) var body map[string]interface{} require.NoError(t, json.Unmarshal(rr.Body.Bytes(), &body)) - require.NotEmpty(t, body["access_token"], "must still forward original id_token") + require.NotEmpty(t, body["access_token"], "must still return original id_token") } // --- soft-fail: upstream returns RFC 6749 error_description --------------- @@ -539,7 +535,7 @@ func TestOAuthToken_RefreshErrorDescriptionSurfacedInLog(t *testing.T) { })) t.Cleanup(upstream.Close) cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ - Enabled: true, Mode: "forward", Issuer: upstream.URL, JWKSURL: upstream.URL + "/jwks", + Enabled: true, Broker: true, Issuer: upstream.URL, JWKSURL: upstream.URL + "/jwks", AuthURL: upstream.URL + "/authorize", TokenURL: upstream.URL + "/token", ClientID: "broker", ClientSecret: "s", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-errdesc-32bytes!!!!!!", UpstreamOfflineAccess: true, @@ -569,7 +565,7 @@ func TestOAuthAuthorize_ForceConsentFlag(t *testing.T) { t.Parallel() cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", + Broker: true, Issuer: "https://accounts.google.com", AuthURL: "https://accounts.google.com/o/oauth2/v2/auth", TokenURL: "https://oauth2.googleapis.com/token", diff --git a/cmd/altinity-mcp/oauth_ha_replay_test.go b/cmd/altinity-mcp/oauth_ha_replay_test.go index 1f820663..9bb4f9a9 100644 --- a/cmd/altinity-mcp/oauth_ha_replay_test.go +++ b/cmd/altinity-mcp/oauth_ha_replay_test.go @@ -88,7 +88,6 @@ func TestHAReplay_UpstreamInvalidGrantOnReplay(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: upstream.URL, JWKSURL: upstream.URL + "/jwks", AuthURL: upstream.URL + "/authorize", diff --git a/cmd/altinity-mcp/oauth_regression_test.go b/cmd/altinity-mcp/oauth_regression_test.go index b43f17ad..cb40f92d 100644 --- a/cmd/altinity-mcp/oauth_regression_test.go +++ b/cmd/altinity-mcp/oauth_regression_test.go @@ -15,6 +15,7 @@ import ( "context" "crypto/rand" "crypto/rsa" + "encoding/base64" "encoding/json" "fmt" "io" @@ -165,31 +166,29 @@ func TestOAuthPendingAuthAndAuthCodeRoundTrip(t *testing.T) { }) } -// --- MCP auth injector is a pure forwarder ------------------------------ +// --- MCP auth injector passes bearer context ---------------------------- -// The refactor moved per-request JWT validation to the CH-side ch-jwt-verify -// sidecar (gating mode) / Antalya's token_processors (forward mode). MCP's -// injector now just confirms a bearer is present and threads it into the -// context — anything else is the CH-side authenticator's responsibility. +// Per-request JWT validation is handled by the CH-side ch-jwt-verify sidecar or +// Antalya's token_processors. MCP's injector confirms a bearer is present and +// threads it into the context; the CH-side authenticator owns the rest. // // Note for security reviewers: the previous version of this test asserted // that wrong-audience / expired / unsigned JWTs were rejected at the MCP -// edge with HTTP 401. That protection now lives at the data-plane gate: -// - gating mode: the ch-jwt-verify sidecar (github.com/altinity/altinity-oauth-helper) +// edge with HTTP 401. That protection now lives at the data plane: +// - broker=false: the ch-jwt-verify sidecar (github.com/altinity/altinity-oauth-helper) // covers signature, aud byte-equal, exp/nbf, scope, identity policy, // user-vs-claim match. -// - forward mode: ClickHouse's re-validates the same +// - OAuth: ClickHouse's re-validates the same // JWT against the upstream JWKS on every query. // // The MCP layer intentionally does not duplicate those checks. -func TestOAuthMCPAuthInjectorForwardsBearer(t *testing.T) { +func TestOAuthMCPAuthInjectorPassesBearerContext(t *testing.T) { t.Parallel() provider := newRegressionOIDCProvider(t, nil, nil) cfg := config.Config{ Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -267,7 +266,6 @@ func TestOAuthForwardModeTokenResourceMismatch(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: "https://idp.example.com", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: signingSecret, @@ -322,7 +320,7 @@ func TestRegisterOAuthHTTPRoutesAliases(t *testing.T) { app := &application{ config: config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", + Broker: true, Issuer: "https://idp.example.com", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-aliases-32-bytes!!!!!", @@ -332,7 +330,7 @@ func TestRegisterOAuthHTTPRoutesAliases(t *testing.T) { app.registerOAuthHTTPRoutes(mux) // Each alias path must return the same JSON document (modulo OIDC's - // id_token_signing_alg_values_supported extra in gating-mode openid + // id_token_signing_alg_values_supported extra in resource-server openid // configuration — we run forward so neither path adds it). for _, path := range []string{ "/.well-known/oauth-authorization-server", @@ -413,7 +411,6 @@ func TestCIMDFullAuthCodeFlow(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: upstream.URL, JWKSURL: upstream.URL + "/jwks", AuthURL: upstream.URL + "/authorize", @@ -504,7 +501,7 @@ func TestOAuthRegisterGone(t *testing.T) { app := &application{ config: config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", + Broker: true, Issuer: "https://idp.example.com", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-410-32bytes!!!!!!!!!!!", @@ -529,7 +526,6 @@ func TestOAuthTokenRefreshGrantUnsupported(t *testing.T) { app := &application{ config: config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: "https://idp.example.com", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-refresh-32bytes!!!!!!!", @@ -549,6 +545,96 @@ func TestOAuthTokenRefreshGrantUnsupported(t *testing.T) { require.Equal(t, "unsupported_grant_type", body["error"]) } +func TestOAuthTokenAuthCode_AudienceBrokerExpiresInFollowsAccessToken(t *testing.T) { + t.Parallel() + + const ( + clientID = "https://client.example.com/metadata.json" + redirectURI = "https://demo.example.com/callback" + ) + cimdServer := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + fmt.Fprintf(w, `{"client_id":%q,"client_name":"D","redirect_uris":[%q],"token_endpoint_auth_method":"none"}`, clientID, redirectURI) + })) + defer cimdServer.Close() + + now := time.Now() + accessToken := makeUnsignedJWT(t, map[string]interface{}{ + "email": "alice@example.com", + "aud": "https://api.example.com", + "exp": now.Add(10 * time.Minute).Unix(), + }) + provider := newRegressionOIDCProvider(t, nil, nil) + idToken := provider.issueIDToken(t, map[string]interface{}{ + "iss": provider.server.URL, + "aud": "broker-client", + "sub": "alice", + "email": "alice@example.com", + "exp": now.Add(time.Hour).Unix(), + }) + provider.tokenResp = map[string]interface{}{ + "access_token": accessToken, + "id_token": idToken, + "token_type": "Bearer", + "expires_in": 3600, + } + + cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ + Enabled: true, + Broker: true, + Issuer: provider.server.URL, + JWKSURL: provider.server.URL + "/jwks", + AuthURL: provider.server.URL + "/authorize", + TokenURL: provider.server.URL + "/token", + ClientID: "broker-client", + ClientSecret: "s", + Audience: "https://api.example.com", + PublicAuthServerURL: "https://mcp.example.com", + SigningSecret: "regression-access-expiry-32bytes!!", + }}} + app := &application{ + config: cfg, + mcpServer: altinitymcp.NewClickHouseMCPServer(cfg, "test"), + cimdResolver: testResolver(t, cimdServer), + } + + verifier, err := broker.NewPKCEVerifier() + require.NoError(t, err) + issued := oauthIssuedCode{ + ClientID: clientID, + RedirectURI: redirectURI, + Scope: "openid email", + CodeChallenge: broker.PKCEChallenge(verifier), + CodeChallengeMethod: "S256", + UpstreamAuthCode: "upstream-code", + UpstreamPKCEVerifier: "upstream-verifier", + ExpiresAt: time.Now().Add(60 * time.Second), + } + code, err := app.encodeAuthCode(issued) + require.NoError(t, err) + + form := url.Values{} + form.Set("grant_type", "authorization_code") + form.Set("client_id", issued.ClientID) + form.Set("redirect_uri", redirectURI) + form.Set("code", code) + form.Set("code_verifier", verifier) + req := httptest.NewRequest(http.MethodPost, "https://mcp.example.com/oauth/token", strings.NewReader(form.Encode())) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + require.NoError(t, req.ParseForm()) + + rr := httptest.NewRecorder() + app.handleOAuthTokenAuthCode(rr, req) + require.Equal(t, http.StatusOK, rr.Code, "body=%s", rr.Body.String()) + var body map[string]interface{} + require.NoError(t, json.Unmarshal(rr.Body.Bytes(), &body)) + require.Equal(t, accessToken, body["access_token"]) + expiresIn, ok := body["expires_in"].(float64) + require.True(t, ok) + require.Greater(t, expiresIn, float64(8*time.Minute/time.Second)) + require.Less(t, expiresIn, float64(12*time.Minute/time.Second), "must use access-token exp, not the one-hour id_token/token response expiry") +} + // --- AS metadata shape --------------------------------------------------- func TestOAuthASMetadataShape(t *testing.T) { @@ -556,7 +642,6 @@ func TestOAuthASMetadataShape(t *testing.T) { app := &application{ config: config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: "https://idp.example.com", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-shape-32bytes!!!!!!!!!", @@ -608,7 +693,6 @@ func TestOAuthTokenUpstream200WithErrorBody(t *testing.T) { cfg := config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: upstream.URL, JWKSURL: upstream.URL + "/jwks", AuthURL: upstream.URL + "/authorize", @@ -652,7 +736,7 @@ func TestOAuthTokenUpstream200WithErrorBody(t *testing.T) { // --- helpers ------------------------------------------------------------- -// regressionOIDCProvider is a small fake OIDC AS used by the forward-mode +// regressionOIDCProvider is a small fake OIDC AS used by the OAuth // JWT validation tests. It signs id_tokens with RS256 and exposes JWKS at // /jwks so altinitymcp.ValidateUpstreamIdentityToken can verify them. type regressionOIDCProvider struct { @@ -738,6 +822,14 @@ func (p *regressionOIDCProvider) issueIDToken(t *testing.T, claims map[string]in return tok } +func makeUnsignedJWT(t *testing.T, claims map[string]interface{}) string { + t.Helper() + header := base64.RawURLEncoding.EncodeToString([]byte(`{"alg":"none","typ":"JWT"}`)) + payload, err := json.Marshal(claims) + require.NoError(t, err) + return header + "." + base64.RawURLEncoding.EncodeToString(payload) + ".sig" +} + // Avoid unused-import errors if some sub-tests get gated off. var _ = context.Background var _ = io.Discard diff --git a/cmd/altinity-mcp/oauth_server.go b/cmd/altinity-mcp/oauth_server.go index 22cbb43b..74b08e5f 100644 --- a/cmd/altinity-mcp/oauth_server.go +++ b/cmd/altinity-mcp/oauth_server.go @@ -49,13 +49,11 @@ const ( defaultAccessTokenTTLSeconds = 60 * 60 // brokerModeIDTokenRefreshThresholdSeconds is the remaining-life floor // below which we'll use the upstream refresh_token at /token to mint a - // fresh id_token before forwarding (#121). Set at 55 minutes so a + // fresh id_token before returning it (#121). Set at 55 minutes so a // freshly-minted Google id_token (exp = iat + 1h) is never re-fetched // but anything Google reused from a warm session is. Only fires when // upstream_offline_access is enabled AND the upstream actually returned - // a refresh_token. Applies to all broker-mode deployments (forward and - // gating+broker_upstream alike) since the bearer is the id_token in - // both. + // a refresh_token. brokerModeIDTokenRefreshThresholdSeconds = 55 * 60 ) @@ -147,27 +145,39 @@ func (a *application) oauthEnabled() bool { return a.GetCurrentConfig().Server.OAuth.Enabled } -func (a *application) oauthMode() string { - return a.GetCurrentConfig().Server.OAuth.NormalizedMode() -} - -func (a *application) oauthForwardMode() bool { - return a.oauthMode() == "forward" -} - // oauthBrokerMode reports whether altinity-mcp is acting as the OAuth AS to -// MCP clients (/authorize + /token + /callback). True for forward mode -// always; true for gating mode iff the operator opts in via -// oauth.broker_upstream. When true, /oauth/* routes are registered and the -// broker-flow handlers fire. The /mcp request path still differs per mode — -// forward forwards the upstream bearer to CH as Bearer; gating sends Basic -// base64(email:JWT) for the ch-jwt-verify sidecar to validate. +// MCP clients (/authorize + /token + /callback). When true, /oauth/* routes +// are registered and the broker-flow handlers fire. +// CH auth wire format (Bearer vs Basic) is auto-detected at runtime. func (a *application) oauthBrokerMode() bool { + return a.GetCurrentConfig().Server.OAuth.Broker +} + +// brokerUpstreamAudience returns the API audience to request from the upstream +// IdP at /authorize, or "" when none should be sent. +// +// Auth0 (and other RFC-8707-via-`audience` IdPs) mint a JWT *access* token with +// `aud=` only when the authorize request carries an `audience` +// param; without it they return an opaque access token plus an id_token whose +// `aud` is the client_id. A CH-side ch-jwt-verify sidecar validating +// `aud=` therefore can only be satisfied if we (a) request the +// audience upstream and (b) forward the resulting access token. We key this on +// the configured `audience` so the value byte-equals what the sidecar expects. +// +// Google is explicitly excluded: it ignores `audience`, never issues +// arbitrary-aud JWT access tokens, and its CH `token_processors` path validates +// the id_token (aud=client_id) directly. Sending `audience` to Google is at +// best a no-op and at worst an `invalid_request`, so the Google broker path +// (antalya, github) is left exactly as before. +func (a *application) brokerUpstreamAudience() string { cfg := a.GetCurrentConfig().Server.OAuth - if cfg.IsForwardMode() { - return true + if !a.oauthBrokerMode() { + return "" + } + if broker.IsGoogleIssuer(cfg.Issuer) { + return "" } - return cfg.IsGatingMode() && cfg.BrokerUpstream + return strings.TrimSpace(cfg.Audience) } func (a *application) oauthJWESecret() []byte { @@ -380,6 +390,36 @@ func (a *application) oauthTokenPath() string { func (a *application) oauthChallengeHeader(r *http.Request, errCode, errDesc, scope string) string { baseURL := a.resourceBaseURL(r) resourceMetadata := broker.JoinURLPath(baseURL, defaultProtectedResourceMetadataPath) + // Multi-cluster: prefer the per-cluster PRM path so claude.ai / + // ChatGPT pull cluster-correct metadata on the OAuth bootstrap. The + // router places the validated cluster name on ctx; if it's not there + // (single-cluster, or an /oauth/* path) fall through to the host-root. + // + // The fetch URL is built from the SERVING host (schemeAndHost), not from + // baseURL. baseURL honours public_resource_url, which is the *logical* + // resource identifier — advertised in the PRM `resource` field and + // round-tripped by the client to the token `aud`. That identifier may be + // a shared/foreign audience hosted elsewhere (e.g. several clusters + // pinned to one cluster's CH-side sidecar audience). The RFC 9728 + // metadata document, by contrast, is served by THIS process at + // /mcp/{cluster}/.well-known/oauth-protected-resource on the request + // host, so the resource_metadata URL must point there or discovery + // dead-ends on a host that doesn't serve it. When public_resource_url + // equals the serving host (the canonical single-audience deployment) + // this is byte-identical to the previous behaviour. + if cluster, ok := altinitymcp.ClusterFromContext(r.Context()); ok { + metaBase := a.schemeAndHost(r) + // schemeAndHost can under-report https behind a TLS-terminating edge + // proxy when neither OpenAPI.TLS nor Server.TLS is set — and MC mode + // forbids OpenAPI, so there is no knob to force https there. The + // advertised resource identifier (baseURL) carries the real public + // scheme; mirror it onto the serving host so the metadata URL is + // fetchable over the same https the client used to reach us. + if strings.HasPrefix(baseURL, "https://") && strings.HasPrefix(metaBase, "http://") { + metaBase = "https://" + strings.TrimPrefix(metaBase, "http://") + } + resourceMetadata = broker.JoinURLPath(metaBase, "/mcp/"+cluster+defaultProtectedResourceMetadataPath) + } if errCode == "" { errCode = "invalid_token" } @@ -479,11 +519,10 @@ func (a *application) createMCPAuthInjector(cfg config.Config) func(http.Handler a.writeOAuthError(w, r, altinitymcp.ErrMissingOAuthToken) return } - // MCP is a pure forwarder: the CH-side ch-jwt-verify sidecar - // performs signature/iss/aud/exp/scope validation against the - // upstream JWKS for every query. Per-request validation here - // would duplicate work for opaque tokens and add a JWKS hop - // on the hot path; skip it and let the sidecar gate. Claims + // The CH-side ch-jwt-verify sidecar performs signature/iss/aud/ + // exp/scope validation against the upstream JWKS for every query. + // Per-request validation here would duplicate work for opaque + // tokens and add a JWKS hop on the hot path. Claims // stay unset on the context — ClaimsFromContext returns nil // either way, so no downstream nil-deref risk. ctx = context.WithValue(ctx, altinitymcp.OAuthTokenKey, oauthToken) @@ -611,11 +650,11 @@ func (a *application) resolveUpstreamTokenURL() (string, error) { } // refreshUpstreamIDToken exchanges the upstream refresh_token for a fresh -// id_token via the upstream's RFC 6749 §6 refresh-token grant. Used in -// forward mode at /token when the just-redeemed id_token has a short -// remaining life (Google's silent-SSO can return a cached id_token whose -// `exp` is near). The returned refresh_token (if any) is discarded — #115 -// keeps downstream refresh out of scope. +// id_token via the upstream's RFC 6749 §6 refresh-token grant. Used in broker +// mode at /token when the just-redeemed id_token has a short remaining life +// (Google's silent-SSO can return a cached id_token whose `exp` is near). The +// returned refresh_token (if any) is discarded — #115 keeps downstream refresh +// out of scope. // // Returns the fresh id_token plus its parsed identity claims. On any // failure the caller should fall back to the original (near-expired) @@ -659,8 +698,8 @@ func (a *application) refreshUpstreamIDToken(refreshToken string) (string, *alti } if parsed.IDToken == "" { // Some IdPs only return a new access_token on refresh, no id_token. - // We need id_token specifically (it's what we forward as the bearer - // in forward mode), so treat absence as failure and fall back. + // We need id_token specifically for broker clients, so treat absence + // as failure and fall back. return "", nil, fmt.Errorf("upstream returned no id_token") } claims, err := a.mcpServer.ValidateUpstreamIdentityToken(parsed.IDToken, cfg.Server.OAuth.ClientID) @@ -678,20 +717,11 @@ func (a *application) handleOAuthProtectedResource(w http.ResponseWriter, r *htt cfg := a.GetCurrentConfig().Server.OAuth baseURL := a.resourceBaseURL(r) // authorization_servers advertises where the MCP client should go to get - // a token. Three shapes: - // - Pure gating (#109, no broker): MCP is a pure resource server and - // the external AS (configured via `oauth.issuer`) is responsible for - // DCR / authorize / token. Advertise the upstream issuer byte-equal - // to what tokens carry in their `iss` claim. - // - Forward mode: MCP fronts the upstream IdP and is itself the AS to - // MCP clients. Advertise our own auth-server base URL. - // - Gating + broker_upstream: same as forward mode from the - // MCP-client's perspective — MCP exposes /oauth/{register,authorize, - // callback,token} and brokers the upstream IdP behind the scenes. - // Advertise our own auth-server base URL, NOT the upstream issuer. - // If we advertised the upstream issuer here, claude.ai/ChatGPT would - // try to DCR against it directly (which most upstreams reject) and - // never discover our broker endpoints. + // a token: + // - broker:false: MCP is a pure resource server and the external AS + // (configured via oauth.issuer) is responsible for authorization. + // - broker:true: MCP exposes /oauth/{register,authorize,callback,token} + // and brokers the upstream IdP behind the scenes. var authorizationServers []string if a.oauthBrokerMode() { authorizationServers = []string{strings.TrimRight(a.oauthAuthorizationServerBaseURL(r), "/")} @@ -729,7 +759,7 @@ func handleOAuthRegisterRemoved(w http.ResponseWriter, _ *http.Request) { // oauthASMetadata returns the field set shared by RFC 8414 (oauth-authorization-server) // and OIDC Discovery (openid-configuration). Both endpoints serve the same -// AS-side advertisement; OIDC adds two extra fields under gating mode (see +// AS-side advertisement; OIDC adds two extra fields when broker=false (see // handleOAuthOpenIDConfiguration). // // `issuer` is published without a trailing slash to match RFC 8414 §2 @@ -767,7 +797,7 @@ func (a *application) handleOAuthOpenIDConfiguration(w http.ResponseWriter, r *h return } resp := a.oauthASMetadata(r) - if !a.oauthForwardMode() { + if !a.oauthBrokerMode() { resp["subject_types_supported"] = []string{"public"} resp["id_token_signing_alg_values_supported"] = []string{"HS256"} } @@ -897,6 +927,13 @@ func (a *application) handleOAuthAuthorize(w http.ResponseWriter, r *http.Reques upstream.Set("state", callbackState) upstream.Set("code_challenge", broker.PKCEChallenge(upstreamVerifier)) upstream.Set("code_challenge_method", "S256") + // Auth0-style upstreams need an explicit `audience` to mint a JWT access + // token with aud= (the form a ch-jwt-verify sidecar validates). + // No-op for Google (see brokerUpstreamAudience). + if aud := a.brokerUpstreamAudience(); aud != "" { + upstream.Set("audience", aud) + log.Debug().Str("upstream_audience", aud).Msg("OAuth /authorize: requesting upstream API audience for broker access token") + } http.Redirect(w, r, authURL+"?"+upstream.Encode(), http.StatusFound) } @@ -944,7 +981,7 @@ func (a *application) handleOAuthCallback(w http.ResponseWriter, r *http.Request log.Info(). Str("client_id", truncateForLog(pending.ClientID, 80)). - Bool("forward_mode", a.oauthForwardMode()). + Bool("broker", a.oauthBrokerMode()). Msg("OAuth /callback wrapped upstream auth code in downstream JWE; awaiting /token redemption") redirect, err := url.Parse(pending.RedirectURI) @@ -978,7 +1015,7 @@ func (a *application) handleOAuthToken(w http.ResponseWriter, r *http.Request) { grantType := r.Form.Get("grant_type") log.Info(). Str("grant_type", grantType). - Bool("forward_mode", a.oauthForwardMode()). + Bool("broker", a.oauthBrokerMode()). Msg("OAuth /oauth/token request received") switch grantType { case "authorization_code": @@ -1191,7 +1228,7 @@ func (a *application) handleOAuthTokenAuthCode(w http.ResponseWriter, r *http.Re log.Info(). Bool("has_access_token", tokenResp.AccessToken != ""). Bool("has_id_token", tokenResp.IDToken != ""). - Bool("forward_mode", a.oauthForwardMode()). + Bool("broker", a.oauthBrokerMode()). Str("scope", tokenResp.Scope). Int64("expires_in", tokenResp.ExpiresIn). Str("client_id", truncateForLog(clientID, 80)). @@ -1201,7 +1238,7 @@ func (a *application) handleOAuthTokenAuthCode(w http.ResponseWriter, r *http.Re // client. Claims are NOT bound into the downstream token (audience // binding deferred per #115 § Non-goals); the validation has three // jobs: fail-fast on a malformed upstream response with a proper 502, - // confirm the upstream id_token signature/audience for forward mode, + // confirm the upstream id_token signature/audience for broker mode, // and surface the id_token `exp` so we report an accurate `expires_in` // to the MCP client below (used at the "expiresIn = identityClaims..." // line further down). @@ -1221,14 +1258,9 @@ func (a *application) handleOAuthTokenAuthCode(w http.ResponseWriter, r *http.Re return } } - // #121: whenever we're in broker mode the bearer we return is the - // upstream id_token itself (`bearerToken := tokenResp.IDToken` above). - // Google's silent-SSO can return a cached id_token whose `exp` is set - // from the original mint time, not now — sometimes leaving only minutes - // of remaining life. Forward mode forwards that bearer to ClickHouse; - // gating-with-broker_upstream returns it as the session bearer the MCP - // client uses for /mcp. Both modes are affected — gate on broker mode, - // not forward mode. + // #121: in broker mode Google's silent-SSO can return a cached id_token + // whose `exp` is set from the original mint time, not now — sometimes + // leaving only minutes of remaining life. // // If we have a refresh_token and the id_token is below the freshness // threshold, exchange it for a fresh id_token. Soft-fail: keep the @@ -1241,7 +1273,7 @@ func (a *application) handleOAuthTokenAuthCode(w http.ResponseWriter, r *http.Re if refreshErr != nil { log.Warn().Err(refreshErr). Int64("remaining_seconds", remaining). - Msg("OAuth /token: id_token refresh failed; forwarding original near-expired token") + Msg("OAuth /token: id_token refresh failed; returning original near-expired token") } else { tokenResp.IDToken = freshIDToken identityClaims = freshClaims @@ -1263,15 +1295,36 @@ func (a *application) handleOAuthTokenAuthCode(w http.ResponseWriter, r *http.Re tokenType = "Bearer" } bearerToken := tokenResp.IDToken + bearerIsAccessToken := false if bearerToken == "" { bearerToken = tokenResp.AccessToken + bearerIsAccessToken = true + } + // When we asked the upstream for an API audience (Auth0 + sidecar path), + // the bearer the MCP client must present — and that we forward to CH — is + // the *access* token: it carries aud= (what the sidecar validates) + // whereas the id_token's aud is the client_id. The access token also + // carries `email` here (the upstream API is configured to include it), so + // the Basic base64(email:token) rewrite and the sidecar's username_claim + // both still resolve. Falls back to the id_token if no access token came + // back. Google broker deployments keep the id_token path untouched. + if a.brokerUpstreamAudience() != "" && tokenResp.AccessToken != "" { + bearerToken = tokenResp.AccessToken + bearerIsAccessToken = true } var expiresIn int64 - if tokenResp.IDToken != "" && identityClaims != nil && identityClaims.ExpiresAt > 0 { + if bearerIsAccessToken && tokenResp.AccessToken != "" { + if exp, ok := altinitymcp.BearerExp(tokenResp.AccessToken); ok { + expiresIn = int64(time.Until(exp).Seconds()) + } else if tokenResp.ExpiresIn > 0 { + expiresIn = tokenResp.ExpiresIn + } + } else if tokenResp.IDToken != "" && identityClaims != nil && identityClaims.ExpiresAt > 0 { expiresIn = identityClaims.ExpiresAt - time.Now().Unix() } else if tokenResp.ExpiresIn > 0 { expiresIn = tokenResp.ExpiresIn - } else { + } + if expiresIn == 0 { expiresIn = int64(defaultAccessTokenTTLSeconds) } if expiresIn < 0 { @@ -1300,13 +1353,12 @@ func truncateForLog(value string, max int) string { func (a *application) registerOAuthHTTPRoutes(mux *http.ServeMux) { // RFC 9728 protected-resource metadata is the only OAuth endpoint MCP - // itself owns under pure gating mode (#109): MCP is a pure resource - // server and points clients at the upstream IdP for everything else. - // Under forward mode — and under gating mode with `broker_upstream=true` - // — MCP also fronts the upstream IdP as a proxying AS, so - // /oauth/register, /authorize, /callback, /token, and the AS-discovery - // metadata endpoints stay registered on that path. The decision is - // centralised in oauthBrokerMode(). + // itself owns when broker=false: MCP is a pure resource server and points + // clients at the upstream IdP for everything else. When broker=true, MCP + // also fronts the upstream IdP as a proxying AS, so /oauth/register, + // /authorize, /callback, /token, and the AS-discovery metadata endpoints + // stay registered on that path. The decision is centralised in + // oauthBrokerMode(). mux.HandleFunc(defaultProtectedResourceMetadataPath, a.handleOAuthProtectedResource) if !a.oauthBrokerMode() { diff --git a/cmd/altinity-mcp/oauth_server_test.go b/cmd/altinity-mcp/oauth_server_test.go index 0b018668..0fb26dee 100644 --- a/cmd/altinity-mcp/oauth_server_test.go +++ b/cmd/altinity-mcp/oauth_server_test.go @@ -39,21 +39,19 @@ func TestOAuthMCPAuthInjector(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "gating", Issuer: "https://accounts.example.com", PublicAuthServerURL: "https://mcp.example.com", Audience: "https://mcp.example.com", - SigningSecret: "test-gating-secret-32-byte-key!!", + SigningSecret: "test-signing-secret-32-byte-key!!", }, }, }, mcpServer: altinitymcp.NewClickHouseMCPServer(config.Config{Server: config.ServerConfig{JWE: config.JWEConfig{Enabled: true, JWESecretKey: "this-is-a-32-byte-secret-key!!", JWTSecretKey: "jwt-secret"}, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "gating", Issuer: "https://accounts.example.com", PublicAuthServerURL: "https://mcp.example.com", Audience: "https://mcp.example.com", - SigningSecret: "test-gating-secret-32-byte-key!!", + SigningSecret: "test-signing-secret-32-byte-key!!", }}}, "test"), } @@ -108,7 +106,6 @@ func TestOAuthMCPAuthInjectorForwardModePassesOpaqueBearerToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", }, }, }, @@ -116,7 +113,6 @@ func TestOAuthMCPAuthInjectorForwardModePassesOpaqueBearerToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", }, }, }, "test"), @@ -140,7 +136,7 @@ func TestOAuthMCPAuthInjectorForwardModePassesOpaqueBearerToken(t *testing.T) { } // TestOAuthMCPAuthInjectorForwardModeValidatesJWT is the integration check -// for the C-1 fix: forward mode used to skip ValidateOAuthToken entirely, +// for the C-1 fix: OAuth used to skip ValidateOAuthToken entirely, // so any string in `Authorization: Bearer …` reached the inner handler // and was forwarded to ClickHouse. After C-1 the auth layer validates JWT // bearers when Issuer/JWKSURL is configured and rejects bad ones at 401. @@ -162,7 +158,7 @@ func exchangeOAuthBrowserCode(t *testing.T, app *application, clientID, code, re } // TestOAuthForwardModeTokenResourceMismatch pins the RFC 8707 §2.2 enforcement -// in forward mode: a /token (auth-code grant) request whose `resource` differs +// in OAuth: a /token (auth-code grant) request whose `resource` differs // from the one already pinned at /authorize must be rejected with // invalid_target, regardless of which mode we're running in. func generateOAuthTokenForApp(claims map[string]interface{}) (string, error) { @@ -170,7 +166,7 @@ func generateOAuthTokenForApp(claims map[string]interface{}) (string, error) { if err != nil { return "", err } - hashedSecret := jwe_auth.HashSHA256([]byte("test-gating-secret-32-byte-key!!")) + hashedSecret := jwe_auth.HashSHA256([]byte("test-signing-secret-32-byte-key!!")) signer, err := jose.NewSigner(jose.SigningKey{Algorithm: jose.HS256, Key: hashedSecret}, (&jose.SignerOptions{}).WithType("JWT")) if err != nil { return "", err @@ -213,9 +209,6 @@ func newJWEStateTestApp(secret string) *application { return &application{config: cfg} } -// newGatingModeTestApp creates an application configured for gating mode OAuth. -// doGatingAuthCodeFlow runs the full authorize→callback→token exchange and -// returns the parsed token response. func exchangeRefreshToken(t *testing.T, app *application, clientID, refreshToken string) *httptest.ResponseRecorder { t.Helper() form := url.Values{} @@ -230,9 +223,6 @@ func exchangeRefreshToken(t *testing.T, app *application, clientID, refreshToken return rr } -// newForwardModeRefreshTestApp configures a forward-mode app with -// UpstreamOfflineAccess enabled, so the auth-code response carries a JWE -// refresh_token wrapping the upstream IdP's refresh token. func TestNormalizedPath(t *testing.T) { t.Parallel() tests := []struct { @@ -633,7 +623,6 @@ func TestOAuthAuthorizeErrorsAreJSON(t *testing.T) { app := &application{ config: config.Config{Server: config.ServerConfig{OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: "https://idp.example.com", PublicAuthServerURL: "https://mcp.example.com", SigningSecret: "regression-f1-jsonerr-32bytes!!!!", @@ -684,3 +673,98 @@ func TestOAuthAuthorizeErrorsAreJSON(t *testing.T) { require.Equal(t, "invalid_request", body["error"]) }) } + +// TestOAuthChallengeHeaderMulticluster pins the multi-cluster WWW-Authenticate +// resource_metadata URL to the SERVING host with the public https scheme, even +// when public_resource_url advertises a foreign audience hosted elsewhere +// (the shared-sidecar-audience deployment shape). Regression guard for the +// cross-host discovery dead-end found in #134 e2e: the metadata URL must point +// where THIS process serves the per-cluster PRM, not at the audience's host. +func TestOAuthChallengeHeaderMulticluster(t *testing.T) { + t.Parallel() + + // public_resource_url is a FOREIGN host (the cluster's CH-side sidecar + // audience); the server itself is reached at multi-mcp.example.com. + app := &application{ + config: config.Config{ + Server: config.ServerConfig{ + OAuth: config.OAuthConfig{ + Enabled: true, + Issuer: "https://accounts.example.com", + Audience: "https://otel-mcp.example.com/", + PublicResourceURL: "https://otel-mcp.example.com/", + }, + }, + }, + } + + const wantPath = "/mcp/otel/.well-known/oauth-protected-resource" + + extract := func(challenge string) string { + for _, p := range strings.Split(challenge, ",") { + p = strings.TrimSpace(p) + if strings.HasPrefix(p, "resource_metadata=") { + v := strings.TrimPrefix(p, "resource_metadata=") + return strings.Trim(v, `"`) + } + } + return "" + } + + t.Run("https request keeps serving host, not foreign audience host", func(t *testing.T) { + req := httptest.NewRequest(http.MethodPost, "https://multi-mcp.example.com/mcp/otel", nil) + req = req.WithContext(altinitymcp.WithCluster(req.Context(), "otel")) + got := extract(app.oauthChallengeHeader(req, "", "", "")) + require.Equal(t, "https://multi-mcp.example.com"+wantPath, got, + "resource_metadata must point at the serving host where the per-cluster PRM is actually served") + require.NotContains(t, got, "otel-mcp.example.com", + "resource_metadata must not inherit public_resource_url's (foreign audience) host") + }) + + t.Run("scheme mirrored to https behind TLS-terminating proxy", func(t *testing.T) { + // r.TLS nil (plain HTTP to the pod) + OpenAPI/Server TLS off — the + // MC-forbidden-OpenAPI case where schemeAndHost would emit http://. + req := httptest.NewRequest(http.MethodPost, "http://multi-mcp.example.com/mcp/otel", nil) + req = req.WithContext(altinitymcp.WithCluster(req.Context(), "otel")) + got := extract(app.oauthChallengeHeader(req, "", "", "")) + require.Equal(t, "https://multi-mcp.example.com"+wantPath, got, + "public https scheme (from the advertised resource) must be mirrored onto the serving host") + }) + + t.Run("single-cluster (no cluster on ctx) unchanged", func(t *testing.T) { + req := httptest.NewRequest(http.MethodPost, "https://otel-mcp.example.com/", nil) + got := extract(app.oauthChallengeHeader(req, "", "", "")) + require.Equal(t, "https://otel-mcp.example.com/.well-known/oauth-protected-resource", got, + "non-MC path must still resolve via resourceBaseURL (public_resource_url)") + }) +} + +// TestBrokerAudience guards the gate that decides whether to request an +// API `audience` from the upstream IdP (so Auth0 mints a JWT access token with +// aud= for the ch-jwt-verify sidecar). Google must never get it. +func TestBrokerAudience(t *testing.T) { + t.Parallel() + mk := func(o config.OAuthConfig) *application { + o.Enabled = true + return &application{config: config.Config{Server: config.ServerConfig{OAuth: o}}} + } + const auth0 = "https://altinity.auth0.com/" + const google = "https://accounts.google.com" + const aud = "https://otel-mcp.demo.altinity.cloud/" + + cases := []struct { + name string + cfg config.OAuthConfig + want string + }{ + {"broker+auth0+aud → request it", config.OAuthConfig{Broker: true, Issuer: auth0, Audience: aud}, aud}, + {"broker+auth0+no aud → none", config.OAuthConfig{Broker: true, Issuer: auth0}, ""}, + {"broker+google+aud → none (Google excluded)", config.OAuthConfig{Broker: true, Issuer: google, Audience: aud}, ""}, + {"resource-server+auth0+aud → none", config.OAuthConfig{Issuer: auth0, Audience: aud}, ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + require.Equal(t, tc.want, mk(tc.cfg).brokerUpstreamAudience()) + }) + } +} diff --git a/docs/development_and_testing.md b/docs/development_and_testing.md index 1f463442..6e3e5a34 100644 --- a/docs/development_and_testing.md +++ b/docs/development_and_testing.md @@ -73,7 +73,7 @@ If Docker is unavailable, the default suite will not be reliable. ## OAuth End-to-End Tests -OAuth e2e tests validate bearer-token forwarding through MCP to ClickHouse. They use a lightweight in-process mock OIDC provider and an `altinity/clickhouse-server:25.8.16.20001.altinityantalya` container (required for `token_processors` support — standard ClickHouse images do not include it). +OAuth e2e tests validate bearer-token authentication through MCP to ClickHouse. They use a lightweight in-process mock OIDC provider and an `altinity/clickhouse-server:25.8.16.20001.altinityantalya` container (required for `token_processors` support — standard ClickHouse images do not include it). These tests run automatically as part of `go test ./...` (skipped with `-short`). diff --git a/docs/oauth_authorization.md b/docs/oauth_authorization.md index e3b127d7..7f8ed5b0 100644 --- a/docs/oauth_authorization.md +++ b/docs/oauth_authorization.md @@ -1,228 +1,23 @@ # OAuth 2.0 Authorization for Altinity MCP Server How OAuth 2.0 / OpenID Connect (OIDC) authentication works in -`altinity-mcp`, when to pick each mode, and how to wire it up against -common identity providers. +`altinity-mcp` and how to wire it up against common identity providers. -Companion: the ClickHouse-side JWT verifier sidecar used by gating mode -lives in [`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper) — +Companion: the ClickHouse-side JWT verifier sidecar lives in +[`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper) — that repo carries the spec, source, helm chart, and Dockerfile. ## Overview -Two modes, picked per deployment via `oauth.mode`: - -- **`mode: gating`** — MCP is a pure forwarder. Each query carries - `Authorization: Basic base64(email:JWT)` to ClickHouse; ClickHouse's - [``](https://clickhouse.com/docs/operations/external-authenticators/http) - delegates the password check to the `ch-jwt-verify` sidecar, which - validates the JWT against the upstream IdP's JWKS and applies identity - policy. Works on any ClickHouse build; the sidecar must be deployed - next to ClickHouse. - -- **`mode: forward`** — MCP is the OAuth Authorization Server to its - clients (terminates Client ID Metadata Documents (CIMD) registration, - `/authorize`, `/token`, `/callback`) and relays the upstream IdP's - token to ClickHouse via `Authorization: Bearer `. ClickHouse - re-validates with its `token_processors` and materializes ephemeral - users from the JWT's claims. Requires a ClickHouse build that supports - `token_processors` (Altinity Antalya 25.8+ or an equivalent JWT-aware - build). - -Detailed flows: [Gating mode](#gating-mode), [Forward mode](#forward-mode). -Decision rationale: [Choosing a mode](#choosing-a-mode). - -## Mode taxonomy - -| Mode | What MCP does | What ClickHouse does | When to use | -|---|---|---|---| -| **`gating`** | Unverified-decodes the JWT's `email` claim. Rewrites `Authorization: Bearer ` to `Authorization: Basic base64(email:JWT)`. Forces HTTP protocol. No cryptographic check per request — the sidecar is the gate. | `` calls the `ch-jwt-verify` sidecar. Sidecar validates signature, `iss`, `aud` (RFC 8707 byte-equal), `exp`/`nbf`/`iat`, required scopes, identity policy, user-vs-claim match. | Works on any ClickHouse build, including OSS. Sidecar must be reachable from the CH pod (loopback for the production trust model). | -| **`forward`** | Acts as the OAuth Authorization Server to the MCP client: accepts CIMD-registered clients, runs `/authorize` + `/token` + `/callback`, brokers the upstream IdP. Relays the upstream JWT to ClickHouse as `Bearer`. | `token_processors` cryptographically validates the bearer (JWKS, `aud`, `exp`) and materializes ephemeral users from claims via the `` `user_directory`. | When the IdP doesn't expose CIMD-compatible endpoints (Google direct, basic-tier Auth0) and ClickHouse supports `token_processors`. | - -Both modes require ClickHouse over HTTP (port 8123 typically). TCP/native -is incompatible with both `` and `token_processors`. - -## Choosing a mode - -Use **gating** when: - -- Your ClickHouse build doesn't support `token_processors` (OSS builds). -- You can deploy a sidecar container next to ClickHouse. -- You prefer pre-provisioned ClickHouse users over per-claim ephemeral - ones. -- You want the cryptographic gate next to the data plane (sidecar) with - MCP holding no secrets at all. - -Use **forward** when: - -- Your ClickHouse build supports `token_processors` (Altinity Antalya - 25.8+). -- You want **ephemeral CH users materialized from JWT claims** — no - pre-`CREATE USER` step per identity. -- You can't or don't want to deploy a sidecar. - -### What's actually different - -| | Gating | Forward | -|---|---|---| -| OAuth Authorization Server | Upstream IdP | MCP (brokering the upstream IdP) | -| Bearer the MCP client receives | Upstream IdP's JWT (TTL set per the IdP's access-token policy) | Upstream IdP id_token (raw passthrough) | -| MCP → ClickHouse credential | `Authorization: Basic base64(email:JWT)` over HTTP | `Authorization: Bearer ` over HTTP | -| Who validates the bearer on every query | The `ch-jwt-verify` sidecar | ClickHouse via `token_processors` | -| CH user provisioning | Pre-create with `IDENTIFIED WITH http SERVER 'ch_jwt_verify' SCHEME 'BASIC'` | Dynamic — `token_processors` materializes ephemeral users from JWT claims | -| ClickHouse build requirement | Any (OSS too); needs the sidecar | Antalya 25.8+ | -| ClickHouse protocol | HTTP only | HTTP only | -| Identity in `system.query_log` | The matched CH user (= JWT email) | The JWT subject directly | -| Refresh-token rotation + reuse detection | Upstream IdP | Upstream IdP (when `upstream_offline_access: true`) | - -### The trust-boundary argument - -In **gating** mode, MCP holds no shared secret with ClickHouse and has -no authority to impersonate users. Every query is gated by a -cryptographic check performed by the sidecar against the upstream IdP's -JWKS. Compromise of the MCP pod buys an attacker nothing the inbound -bearer doesn't already grant. When the sidecar runs colocated in the CH -pod, the CH↔sidecar channel is loopback-only — not network-reachable -from anywhere outside the pod. - -In **forward** mode, ClickHouse re-validates the upstream JWT on every -query via `token_processors`. Same trust property: the cryptographic -gate sits next to the data plane. - -Both modes keep the cryptographic identity check at ClickHouse, not at -MCP. - -### Dynamic user provisioning - -Forward mode's `token_processors` reads JWT claims (`email`, `roles`, -custom claims) and materializes an ephemeral CH user with the right -grants on the fly — no manual `CREATE USER` per identity. Useful for -multi-tenant deployments where the user roster comes from the IdP. - -Gating mode requires `CREATE USER IDENTIFIED WITH http -SERVER 'ch_jwt_verify' SCHEME 'BASIC'` for each OAuth identity. For a -fixed roster of internal users this is straightforward; for a large -churning user base it's a maintenance burden. - -### Token lifecycle - -Gating mode tokens are managed by the upstream IdP. The IdP's -access-token TTL is what bounds revocation latency — pick it on the -IdP side to match your security/availability trade-off (shorter TTL = -faster revocation, more refresh load). Refresh-token rotation and reuse -detection are also the IdP's responsibility. - -Forward mode tokens are also the upstream IdP's. With -`upstream_offline_access: true`, MCP wraps the upstream refresh token in -a JWE keyed by `signing_secret` and returns it to the MCP client; on -refresh, MCP unwraps, calls the upstream `/token` with the upstream -refresh, and returns the rotated pair. Revocation lands at the next -query (subject to JWKS cache TTL + ClickHouse's own caching). - -### When forward mode is the wrong choice - -- ClickHouse build doesn't support `token_processors`. Forward sends the - bearer; CH 403s every query. -- The IdP supports CIMD + RFC 8707 and you don't need ephemeral CH - users from JWT claims — gating + the sidecar is simpler. +Set `oauth.broker: true`. That single flag makes altinity-mcp: -### When gating mode is the wrong choice - -- You want ephemeral user provisioning from JWT claims. -- You can't deploy a sidecar into (or next to) the CH pod — e.g. a - managed CH offering that doesn't expose pod-template editing. - -## Requirements - -- **ClickHouse protocol**: HTTP (port 8123 typically). Both modes route - credentials through CH's HTTP interface; TCP/native has no equivalent. -- **ClickHouse build**: - - Forward: Altinity Antalya 25.8+ or any CH build with - `token_processors`. - - Gating: any build that supports `` and - `IDENTIFIED WITH http` (CH 24.x+ for OSS). -- **Identity Provider**: any OAuth 2.0 / OIDC-compliant IdP. Inbound - client registration uses [Client ID Metadata Documents (CIMD)](https://datatracker.ietf.org/doc/draft-ietf-oauth-client-id-metadata-document/); - Dynamic Client Registration (DCR, RFC 7591) is not exposed by MCP. -- **`signing_secret`**: required whenever OAuth is enabled. Symmetric - secret (≥ 32 bytes) that HKDF-derives keys for all stateless OAuth - artifacts (forward-mode authorization codes, refresh-token JWEs, - HKDF-derived signing material). Generate with `openssl rand -base64 32`. -- **Reverse proxy**: if published behind a proxy, set `public_resource_url` - (both modes) and `public_auth_server_url` (forward only). See - [Frontend / Reverse Proxy](#frontend--reverse-proxy-requirements). - -## MCP client discovery flow - -OAuth-capable MCP clients (Claude, Codex, etc.) discover authentication -automatically per [RFC 9728 (OAuth 2.0 Protected Resource Metadata)](https://www.rfc-editor.org/rfc/rfc9728): - -1. Client `GET`s `/.well-known/oauth-protected-resource` from the MCP - endpoint. -2. **Gating**: response `authorization_servers` points to the **upstream - IdP**. Client fetches `/.well-known/oauth-authorization-server` from - the IdP. MCP's `/.well-known/oauth-authorization-server` returns 404 - (MCP isn't the AS in gating mode). -3. **Forward**: response `authorization_servers` points to **MCP - itself**. Client fetches MCP's - `/.well-known/oauth-authorization-server`, which advertises CIMD - support, omits the DCR `registration_endpoint`, and lists - `grant_types_supported: ["authorization_code"]` plus - `token_endpoint_auth_methods_supported: ["none", "private_key_jwt"]`. -4. Client publishes a CIMD document at a URL it controls and uses that - URL as its `client_id` at `/authorize`. -5. Client initiates the authorization-code flow with S256 PKCE. -6. After login, client exchanges the code for an access token (and, - in forward mode with `upstream_offline_access: true`, a JWE-wrapped - refresh token). -7. Client presents the access token on every MCP request. - -## Forward mode - -ClickHouse with `token_processors` (Antalya 25.8+) validates the -upstream JWT directly; MCP brokers the OAuth dance and forwards the -bearer. - -``` -+--------+ +---------+ +----------+ +-------------+ -| MCP |----->| IdP | | MCP | | ClickHouse | -| Client |<-----| | | Server | | (Antalya) | -| | +---------+ | | | | -| | | | | | -| |---Bearer token------->| | | | -| | |--Bearer->| | | -| | | token |----->| token_proc | -| | | | | validates | -| |<----------------------|<---------|<-----| via JWKS | -| | query results | | | | -+--------+ +----------+ +-------------+ -``` - -Flow: - -1. MCP client discovers MCP-as-AS via RFC 9728 → fetches MCP's AS - metadata → publishes a CIMD document → starts the auth-code flow - against MCP. -2. MCP brokers the upstream IdP using its configured `client_id` / - `client_secret` (operator-issued OAuth application). -3. The upstream's id_token comes back; MCP wraps any refresh token in a - JWE (when `upstream_offline_access: true`) and returns the - id_token unchanged to the MCP client as the access token. -4. The MCP client presents the bearer on each MCP request. MCP forwards - it to ClickHouse as `Authorization: Bearer `. -5. ClickHouse's `token_processors` re-validates against the upstream - JWKS, materializes an ephemeral user, and runs RBAC. - -> **Spec deviation (deliberate).** MCP authorization spec §Access Token -> Privilege Restriction says *"the MCP server MUST NOT pass through the -> token it received from the MCP client"*. Forward mode does pass it -> through — by design. Justification: ClickHouse re-validates the same -> JWT against the upstream JWKS, extracts the same identity, and runs -> its own RBAC. The MCP server is a transparent gateway, not a trust -> anchor. Gating mode is the spec-clean alternative when you don't have -> ClickHouse-side JWT validation set up. - -### Minimum config +1. Act as the **OAuth Authorization Server** to MCP clients (claude.ai, + ChatGPT, Codex) — hosts CIMD resolution, `/oauth/authorize`, + `/oauth/callback`, `/oauth/token`. +2. **Broker the upstream IdP** (Google, Azure AD, Keycloak, etc.) using a + static OAuth application credential you register there. +3. **Auto-detect the ClickHouse wire format** on the first authenticated + request to each endpoint and cache it — no `mode:` config needed. ```yaml config: @@ -233,96 +28,101 @@ config: server: oauth: enabled: true - mode: forward - signing_secret: "<32-byte-random>" # via env in production - issuer: "https://idp.example.com/" - auth_url: "https://idp.example.com/authorize" - token_url: "https://idp.example.com/oauth/token" + broker: true + signing_secret: "<32-byte-random>" # openssl rand -base64 32 + issuer: "https://mcp.example.com" + audience: "https://mcp.example.com" client_id: "" - # client_secret via MCP_OAUTH_CLIENT_SECRET env var - callback_path: "/oauth/callback" + client_secret: "" + auth_url: "" + token_url: "" + public_resource_url: "https://mcp.example.com" public_auth_server_url: "https://mcp.example.com" - public_resource_url: "https://mcp.example.com" - scopes: [openid, email, profile, offline_access] - upstream_offline_access: true + scopes: [openid, email, profile] ``` -## Gating mode +## ClickHouse authentication (auto-detected) -ClickHouse delegates the per-query password check to a colocated -`ch-jwt-verify` sidecar that cryptographically validates the JWT. +altinity-mcp supports two CH-side auth methods. With `broker: true` it +probes the endpoint on first use and caches the result — operators do not +need to configure which one is in use. -``` -+--------+ +-----------+ +----------+ +------------------+ -| MCP |----->| IdP | | MCP | | ClickHouse pod | -| Client |<-----| | | (forward)| | +--------------+ | -| | +-----------+ | | | | ClickHouse | | -| |--Bearer JWT------------>| | | | + http_ | | -| | | rewrite | | | auth | | -| | | to Basic |----->| | | | -| | | email:JWT| | | loopback | | -| | | | | | v | | -| | | | | | ch-jwt- | | -| | | | | | verify | | -| | | | | | (signature | | -| | | | | | + policy) | | -| |<------------------------|<---------|<-----| +--------------+ | -| | query results | | +------------------+ -+--------+ +----------+ -``` +### Bearer (`Authorization: Bearer `) -Flow: - -1. MCP client discovers the upstream IdP via RFC 9728 (MCP's - protected-resource doc points at the IdP, not MCP itself). -2. Client publishes a CIMD document and runs the auth-code flow against - the upstream IdP — MCP is invisible to that exchange. -3. Client presents the IdP-issued JWT on every MCP request. -4. MCP unverified-decodes the JWT's `email` claim (or a namespaced - `*/email` fallback for IdPs that strip top-level OIDC claims for - third-party clients), builds `Authorization: Basic base64(email:JWT)`, - and forwards the query to ClickHouse over HTTP. -5. ClickHouse's `` looks up the matching user - (provisioned as `IDENTIFIED WITH http SERVER 'ch_jwt_verify' SCHEME - 'BASIC'`) and POSTs the Basic header to the sidecar. -6. The sidecar validates signature (against the upstream JWKS), `iss` - (exact match), `aud` (RFC 8707 byte-equal), `exp`/`nbf`/`iat` (with - clock skew), required scopes, identity policy (verified email, - domain allow-lists), and **user-vs-claim match** (Basic user half - must match the JWT's `email` claim). On success it returns 200 with - optional session settings; any non-200 rejects the query. - -### Minimum config +Requires a ClickHouse build with `token_processors` (Altinity Antalya +25.8+). MCP forwards the upstream JWT directly; ClickHouse re-validates +against the upstream JWKS and materializes ephemeral users from claims. -```yaml -config: - clickhouse: - host: clickhouse.example.com - port: 8123 - protocol: http - # Static username/password are unused for OAuth requests — the - # per-request switch sets Auth.Username = email, Password = JWT. - # They remain as fallbacks for the (skipped) startup ping path. - username: default - server: - oauth: - enabled: true - mode: gating - signing_secret: "<32-byte-random>" # via env in production - issuer: "https://idp.example.com/" - jwks_url: "https://idp.example.com/.well-known/jwks.json" - audience: "https://mcp.example.com/" # RFC 8707 byte-equal target - required_scopes: [] # optional; empty disables the check - public_resource_url: "https://mcp.example.com" - # Forbidden under gating mode (startup refuses if any are set): - # client_id, client_secret, token_url, auth_url, userinfo_url, - # public_auth_server_url -``` +- No sidecar needed. +- Users are provisioned dynamically from JWT claims — no `CREATE USER` + per identity. +- Identity in `system.query_log` is the JWT subject (set + `email` to get readable names). + +### Basic (`Authorization: Basic base64(email:JWT)`) + +Works on any ClickHouse build. MCP unverified-decodes the JWT's `email` +claim and rewrites the credential to Basic form. ClickHouse's +[``](https://clickhouse.com/docs/operations/external-authenticators/http) +posts the Basic header to the colocated `ch-jwt-verify` sidecar, which +cryptographically validates signature, `iss`, `aud` (RFC 8707 byte-equal), +`exp`/`nbf`/`iat`, required scopes, identity policy, and the +user-vs-claim match. + +- Sidecar must be deployed next to ClickHouse (see + [`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper)). +- Users are pre-provisioned: `CREATE USER "alice@example.com" IDENTIFIED + WITH http SERVER 'ch_jwt_verify' SCHEME 'BASIC'`. +- Forces HTTP protocol on the driver. + +### Detection logic + +On the first authenticated request to `host:port`, MCP tries Bearer. If +CH returns an auth error (HTTP 401/403, CH exception codes 497/516/519), +it falls back to Basic. The result is stored in an in-memory cache keyed +by `host:port` and reused for all subsequent requests. The cache is +cleared on config reload. + +## MCP client discovery flow + +OAuth-capable MCP clients discover authentication automatically per +[RFC 9728](https://www.rfc-editor.org/rfc/rfc9728): + +1. Client `GET`s `/.well-known/oauth-protected-resource` from the MCP endpoint. +2. Response `authorization_servers` points to **MCP itself**. +3. Client fetches MCP's `/.well-known/oauth-authorization-server`, which + advertises CIMD support (no DCR `registration_endpoint`), lists + `grant_types_supported: ["authorization_code"]` and + `token_endpoint_auth_methods_supported: ["none", "private_key_jwt"]`. +4. Client publishes a CIMD document at a URL it controls and uses that + URL as its `client_id` at `/authorize`. +5. Client initiates the authorization-code flow with S256 PKCE. +6. After login, client exchanges the code for an access token. +7. Client presents the access token on every MCP request. + +## Requirements -### Sidecar + ClickHouse-side config +- **ClickHouse protocol**: HTTP (port 8123 typically). Both Bearer and + Basic routes use CH's HTTP interface; TCP/native has no equivalent. +- **ClickHouse build**: + - Bearer: Altinity Antalya 25.8+ or any CH build with `token_processors`. + - Basic: any build with `` and `IDENTIFIED WITH http` + (CH 24.x+ for OSS). Requires the `ch-jwt-verify` sidecar. +- **Identity Provider**: any OAuth 2.0 / OIDC-compliant IdP that supports + the authorization-code flow. +- **`signing_secret`**: required when `broker: true`. Symmetric secret + (≥ 32 bytes) for all stateless OAuth artifacts. Generate with + `openssl rand -base64 32`. +- **Reverse proxy**: if published behind a proxy, set `public_resource_url` + and `public_auth_server_url`. See + [Frontend / Reverse Proxy](#frontend--reverse-proxy-requirements). -The sidecar deploys as a colocated container in the CH pod (loopback -trust model). See +## Sidecar + ClickHouse-side config (Basic auth path) + +If your ClickHouse uses the `ch-jwt-verify` sidecar, altinity-mcp will +detect and use Basic auth automatically. No MCP-side config change needed. + +The sidecar deploys as a colocated container in the CH pod. See [`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper) for the full spec, helm chart, and wiring example. @@ -344,238 +144,33 @@ ClickHouse registers the sidecar via a `config.d/` XML drop-in: Per-user provisioning: ```sql --- One role per entitlement level CREATE ROLE IF NOT EXISTS mcp_reader; GRANT SELECT ON analytics.* TO mcp_reader; --- One user per identity, delegating auth to the sidecar CREATE USER `alice@example.com` IDENTIFIED WITH http SERVER 'ch_jwt_verify' SCHEME 'BASIC' DEFAULT ROLE mcp_reader; -GRANT mcp_reader TO `alice@example.com`; ``` The grammar token is `http`, not `http_authenticator` — ClickHouse rejects the latter with `SYNTAX_ERROR`. `SERVER 'ch_jwt_verify'` must match the `` block name. -### Identity policy - -All identity policy (verified-email, email-domain allow-listing, -hosted-domain allow-listing, user-vs-claim match) lives in the sidecar's -config: +Identity policy (verified-email, domain allow-listing, user-vs-claim +match) lives in the sidecar's config: ```yaml identity: username_claim: email - match_mode: lowercase_equal # or "exact" + match_mode: lowercase_equal require_email_verified: true - allowed_email_domains: ["example.com"] - allowed_hosted_domains: [] -``` - -Full schema in the [`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper) repo. - -MCP itself applies no identity policy in gating mode — the sidecar is -the only enforcer. - -### Limitations - -- **HTTP only** on the CH side: `` has no TCP - equivalent. -- **Sidecar must be reachable from the CH pod.** Colocated container is - the loopback-only production shape. Standalone Deployment + Service in - the same namespace works for shadow / smoke-test but expands the trust - boundary to the cluster network — lock down with a `NetworkPolicy` if - the namespace is multi-tenant. -- **No role forwarding from the IdP**: ClickHouse permissions come from - what's `GRANT`ed to the matched CH user. The sidecar can return - per-scope ClickHouse **session settings** via `settings_from_scope`, - but those are session settings only, not roles. - -## Refresh tokens - -Both modes can issue refresh tokens; the lifecycle is different. - -### Gating mode - -Refresh tokens are issued and rotated entirely by the upstream IdP — -MCP never sees them. The client exchanges refresh tokens directly -against the IdP's `/token` endpoint. - -Configure refresh-token rotation and reuse detection at the IdP itself -(RFC 6749 §10.4, OAuth 2.1 §4.13.2). MCP plays no role in this. - -### Forward mode (opt-in) - -By default, forward mode does not issue refresh tokens; MCP-client -sessions die when the upstream id_token expires. Set -`upstream_offline_access: true` to opt in: - -1. MCP appends `offline_access` (or the IdP-specific equivalent like - Google's `access_type=offline`) to the upstream authorize redirect. -2. MCP captures the upstream IdP's `refresh_token` from the - token-exchange response and wraps it in a JWE keyed by - `signing_secret`. The MCP client sees only the opaque JWE. -3. On `grant_type=refresh_token` from the MCP client, MCP decrypts the - JWE, calls the upstream `/oauth/token` with the upstream refresh, - re-validates the new id_token (signature via JWKS), and returns the - rotated pair. The new `access_token` is the fresh upstream id_token - verbatim. - -Operator setup: - -- Enable the `offline_access` scope on the IdP (Auth0: tenant API; Okta: - app grant types; Azure AD: scope exposure). Without IdP-side support, - the authorize redirect may hard-fail or silently strip the scope. -- Configure refresh-token rotation + reuse detection at the IdP. This - provides revocation outside MCP, since the JWE itself is stateless. -- Default is `false` so existing deployments are unaffected unless an - operator opts in. Turning on refresh widens the stolen-token blast - radius from the upstream id_token TTL (~1 h) to - `refresh_token_ttl_seconds` (default 30 d). - -### Revocation limitations - -- **Gating**: no MCP-side revocation; token validity is bounded by the - IdP's access-token TTL. Grant revocations take effect within one TTL - window. The sidecar's `cache.positive_ttl` (default 30 s) adds a - small additional window per validated token. -- **Forward**: no server-side revocation of the JWE-wrapped refresh - token. Rotate `signing_secret` to invalidate all outstanding JWEs. - The upstream IdP's reuse detection (if enabled) provides - defense-in-depth. - -## Full configuration reference - -```yaml -server: - oauth: - # Enable OAuth 2.0 authentication - enabled: false - - # OAuth operating mode: - # - "gating": pure resource server. Upstream IdP handles - # /authorize, /token, and refresh-token rotation. - # MCP forwards Authorization: Basic base64(email:JWT) - # to ClickHouse; the ch-jwt-verify sidecar gates. - # - "forward": MCP brokers DCR/CIMD + authorize + token against - # the upstream IdP; relays the upstream JWT to - # ClickHouse as Bearer for token_processors to - # validate. - mode: "gating" - - # Symmetric secret for stateless OAuth artifacts (authorization - # codes, refresh-token JWEs, HKDF-derived signing material). - # Required whenever OAuth is enabled. Minimum 32 bytes — generate - # with `openssl rand -base64 32`. - signing_secret: "" - - # Upstream OAuth/OIDC issuer URL (used for discovery + validation) - issuer: "" - - # URL to fetch JWKS for token validation (discovered from issuer - # if omitted) - jwks_url: "" - - # Expected audience claim in incoming tokens - # (RFC 8707 byte-equality; trailing slash matters) - audience: "" - - # Forward mode only — upstream OAuth client credentials and - # endpoint URLs. FORBIDDEN in gating mode (startup refuses). - client_id: "" - client_secret: "" - token_url: "" - auth_url: "" - userinfo_url: "" - public_auth_server_url: "" - - # Forward mode only — OAuth scopes to request from the upstream IdP - scopes: ["openid", "profile", "email"] - - # Forward mode only — opt into requesting offline_access upstream - # and issuing JWE-wrapped refresh tokens to MCP clients. - upstream_offline_access: false - - # Scopes required in every incoming bearer JWT. Enforced by the - # ch-jwt-verify sidecar (gating) or token_processors (forward); - # MCP itself does not validate per-request. - required_scopes: [] - - # Token lifetimes - access_token_ttl_seconds: 3600 # forward-mode only; gating is IdP-managed - refresh_token_ttl_seconds: 2592000 # forward-mode only; 30 d default - - # Externally visible MCP endpoint URL. Required behind a reverse - # proxy (both modes). - public_resource_url: "" - - # Forward mode only — endpoint paths (defaults shown). - registration_path: "/register" - authorization_path: "/authorize" - callback_path: "/callback" - token_path: "/token" -``` - -### Key options - -| Option | Description | -|---|---| -| `mode` | `gating` (MCP as pure forwarder, sidecar gates) or `forward` (MCP as AS broker, `token_processors` gates) | -| `signing_secret` | Symmetric secret for stateless OAuth artifacts. **Required** whenever OAuth is enabled. ≥ 32 bytes. | -| `issuer` | Upstream IdP issuer URL for OIDC discovery and token validation | -| `audience` | RFC 8707 byte-equal target. Must match the JWT's `aud` claim byte-for-byte (trailing slashes count). | -| `public_resource_url` | Externally visible MCP endpoint URL. **Required** behind a reverse proxy | -| `public_auth_server_url` | Externally visible OAuth authorization server URL. **Forward only** — required behind a reverse proxy. Forbidden in gating. | -| `upstream_offline_access` | Forward only: request `offline_access` upstream and issue JWE-wrapped refresh tokens to MCP clients. Default `false` | - -## Frontend / Reverse Proxy Requirements - -For direct bearer-token use, a plain reverse proxy is usually enough. - -For browser-based MCP login in **forward mode**, the frontend must -expose two public URL spaces: - -- the protected resource, e.g. `https://mcp.example.com/` -- the OAuth authorization server, e.g. `https://mcp.example.com/oauth` - -In **gating mode**, only the protected resource needs to be proxied — -the authorization server is the upstream IdP. - -The proxy must: - -- Forward `Host` and `Authorization` headers unchanged -- Disable response buffering for MCP streaming -- Disable request buffering for long-lived POSTs -- Keep long read/send timeouts -- Not normalize or rewrite the configured callback or metadata paths -- Not rely on forwarded-prefix headers — configure the public OAuth - URLs explicitly in `altinity-mcp` - -Example nginx fragment: - -```nginx -location / { - proxy_http_version 1.1; - proxy_set_header Host $host; - proxy_set_header Authorization $http_authorization; - proxy_buffering off; - proxy_request_buffering off; - proxy_read_timeout 3600; - proxy_send_timeout 3600; - proxy_pass http://ALTINITY_MCP_UPSTREAM; -} + allowed_email_domains: ["example.com"] ``` -If the IdP reports `redirect_uri_mismatch`, verify the public callback -URL the browser sees exactly matches the URI registered at the IdP. - -## ClickHouse `token_processors` (forward mode) +## ClickHouse `token_processors` (Bearer auth path) -Forward mode requires ClickHouse to be configured with `token_processors` -plus a `` entry in `` that maps validated -tokens to ClickHouse users. +If your ClickHouse uses `token_processors`, altinity-mcp will detect +Bearer auth automatically. No MCP-side config change needed. ### Generic OIDC (Keycloak, etc.) @@ -599,26 +194,8 @@ tokens to ClickHouse users. ``` -You can also specify endpoints explicitly: - -```xml - - - - OpenID - https://idp.example.com/userinfo - https://idp.example.com/certs - https://idp.example.com/token/introspect - 60 - - - -``` - ### Azure AD (Microsoft Entra ID) -Azure has a dedicated `azure` type that requires no explicit endpoints: - ```xml @@ -638,59 +215,120 @@ Azure has a dedicated `azure` type that requires no explicit endpoints: ``` -### Roles setup - -You must create the roles referenced in `common_roles` before users can -authenticate: +Roles must exist before users can authenticate: ```sql CREATE ROLE OR REPLACE default_role; GRANT SELECT ON default.* TO default_role; ``` -The default `` is `sub` — IdP users show up in -`system.processes` / `system.query_log` as numeric IDs. To attribute -queries by email, set `email` on the -processor. +The default `` is `sub` — IdP users appear in +`system.processes` as numeric IDs. Set +`email` to attribute queries by email. -## Provider-specific setup +## Full configuration reference + +```yaml +server: + oauth: + # Enable OAuth 2.0 authentication + enabled: false + + # Canonical broker flag: MCP acts as AS to MCP clients (CIMD + /authorize + # + /callback + /token) and brokers the upstream IdP. CH auth format + # (Bearer vs Basic) is auto-detected on first request per endpoint. + broker: true + + # Symmetric secret for stateless OAuth artifacts (authorization codes, + # JWE-wrapped state, HKDF-derived signing material). Required when + # broker: true. Minimum 32 bytes — generate with `openssl rand -base64 32`. + signing_secret: "" + + # Upstream OAuth/OIDC issuer URL (used for discovery and token validation) + issuer: "" + + # URL to fetch JWKS for token validation (discovered from issuer if omitted) + jwks_url: "" + + # Expected audience claim in incoming tokens + # (RFC 8707 byte-equality; trailing slash matters) + audience: "" + + # Upstream OAuth client credentials. Required when broker: true. + client_id: "" + client_secret: "" + token_url: "" + auth_url: "" + userinfo_url: "" + + # OAuth scopes to request from the upstream IdP + scopes: ["openid", "profile", "email"] + + # Append offline_access to the upstream authorize scope so the IdP + # consent screen offers long-lived sessions. v1 does NOT issue downstream + # refresh tokens — clients re-authorize via /oauth/authorize on expiry. + upstream_offline_access: false + + # Scopes required in every incoming bearer JWT + required_scopes: [] + + # Token lifetimes (broker mode) + access_token_ttl_seconds: 3600 + refresh_token_ttl_seconds: 2592000 # 30 d -The OAuth setup for the IdP itself is provider-specific. The examples -below cover the common providers. Substitute the relevant URLs into the -[forward-mode config](#forward-mode) or -[gating-mode config](#gating-mode) above. + # Externally visible MCP endpoint URL. Required behind a reverse proxy. + public_resource_url: "" + + # Externally visible OAuth authorization server URL. Required behind + # a reverse proxy when broker: true. + public_auth_server_url: "" + + # Endpoint paths (defaults shown) + authorization_path: "/authorize" + callback_path: "/callback" + token_path: "/token" +``` + +### Key options + +| Option | Description | +|---|---| +| `broker` | `true`: MCP acts as the OAuth AS, brokers upstream IdP, auto-detects CH auth format. | +| `signing_secret` | Symmetric HKDF master secret for OAuth JWE artifacts. **Required** when `broker: true`. ≥ 32 bytes. | +| `issuer` | Upstream IdP issuer URL for OIDC discovery and token validation. | +| `audience` | RFC 8707 byte-equal target. Must match the JWT's `aud` claim byte-for-byte (trailing slashes count). | +| `public_resource_url` | Externally visible MCP endpoint URL. **Required** behind a reverse proxy. | +| `public_auth_server_url` | Externally visible OAuth AS URL. **Required** behind a reverse proxy when `broker: true`. | +| `upstream_offline_access` | Request `offline_access` upstream so the IdP consent screen offers long-lived sessions. Default `false`. | + +## Provider-specific setup ### Keycloak -1. **Create a realm and client** in the Keycloak admin console: - - Realm: e.g. `mcp` - - Client ID: `clickhouse-mcp` +1. Create a realm and client in the Keycloak admin console: - Client Protocol: `openid-connect` - - Access Type: `confidential` (or `public` for PKCE) + - Access Type: `confidential` - Valid Redirect URIs: your MCP server's `/oauth/callback` - - Enable "Standard Flow" and "Direct Access Grants" + - Enable "Standard Flow" -2. **Create groups + users** for role mapping (configure a group - membership mapper on the client so groups land in the token). - -3. **Forward-mode MCP config:** +2. MCP config: ```yaml server: oauth: enabled: true - mode: forward + broker: true signing_secret: "<32-byte-random>" issuer: "http://keycloak:8080/realms/mcp" - auth_url: "http://keycloak:8080/realms/mcp/protocol/openid-connect/auth" - token_url: "http://keycloak:8080/realms/mcp/protocol/openid-connect/token" audience: "clickhouse-mcp" client_id: "clickhouse-mcp" - client_secret: "" # via env in prod + client_secret: "" + auth_url: "http://keycloak:8080/realms/mcp/protocol/openid-connect/auth" + token_url: "http://keycloak:8080/realms/mcp/protocol/openid-connect/token" scopes: ["openid", "email"] ``` -4. **ClickHouse `token_processors`** (forward mode): +3. ClickHouse `token_processors` (if using Bearer): ```xml @@ -703,70 +341,61 @@ below cover the common providers. Substitute the relevant URLs into the ``` -See [zvonand/grafana-oauth](https://github.com/zvonand/grafana-oauth) -for a complete working example with Keycloak and ClickHouse. +See [zvonand/grafana-oauth](https://github.com/zvonand/grafana-oauth) for +a complete working example with Keycloak and ClickHouse. ### Azure AD (Microsoft Entra ID) -1. **Register an application** in the [Azure Portal](https://portal.azure.com): +1. Register an application in the [Azure Portal](https://portal.azure.com): - Microsoft Entra ID → App registrations → New registration - Add a redirect URI: your MCP `/oauth/callback` + - Create a client secret under Certificates & secrets. + - Configure API permissions: `openid`, `profile`, `email`. -2. **Create a client secret** under Certificates & secrets. - -3. **Configure API permissions**: `openid`, `profile`, `email`. - -4. **Note endpoints** (from app overview): - - Tenant ID, Application (Client) ID - - Token URL: `https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/token` - - Auth URL: `https://login.microsoftonline.com/{TENANT_ID}/oauth2/v2.0/authorize` - - OIDC discovery: `https://login.microsoftonline.com/{TENANT_ID}/v2.0/.well-known/openid-configuration` - -5. **Forward-mode MCP config:** +2. MCP config: ```yaml server: oauth: enabled: true - mode: forward + broker: true signing_secret: "<32-byte-random>" issuer: "https://login.microsoftonline.com//v2.0" audience: "" client_id: "" - client_secret: "" # via env + client_secret: "" token_url: "https://login.microsoftonline.com//oauth2/v2.0/token" auth_url: "https://login.microsoftonline.com//oauth2/v2.0/authorize" scopes: ["openid", "profile", "email"] ``` See [zvonand/grafana-oauth/azure](https://github.com/zvonand/grafana-oauth/tree/main/azure) -for a complete working example with Azure AD and ClickHouse. +for a complete working example. ### Google Cloud Identity -1. **Create OAuth 2.0 credentials** in the [Google Cloud Console](https://console.cloud.google.com) - under APIs & Services → Credentials → OAuth client ID → Web - application. Set the authorized redirect URI to - `/oauth/callback`. +1. Create OAuth 2.0 credentials in the [Google Cloud Console](https://console.cloud.google.com) + under APIs & Services → Credentials → OAuth client ID → Web application. + Set the authorized redirect URI to `/oauth/callback`. -2. **Forward-mode MCP config:** +2. MCP config: ```yaml server: oauth: enabled: true - mode: forward + broker: true signing_secret: "<32-byte-random>" issuer: "https://accounts.google.com" audience: ".apps.googleusercontent.com" client_id: ".apps.googleusercontent.com" - client_secret: "" # via env + client_secret: "" token_url: "https://oauth2.googleapis.com/token" auth_url: "https://accounts.google.com/o/oauth2/v2/auth" scopes: ["openid", "profile", "email"] ``` -3. **ClickHouse `token_processors`** (forward mode): +3. ClickHouse `token_processors` (if using Bearer): ```xml @@ -780,40 +409,32 @@ for a complete working example with Azure AD and ClickHouse. ``` References: [Google OpenID Connect](https://developers.google.com/identity/openid-connect/openid-connect), -[Using OAuth 2.0 to Access Google APIs](https://developers.google.com/identity/protocols/oauth2), -[Setting up OAuth 2.0 in Google Cloud Console](https://support.google.com/googleapi/answer/6158849). +[Using OAuth 2.0 to Access Google APIs](https://developers.google.com/identity/protocols/oauth2). ### AWS Cognito -1. **Create a user pool** in the [AWS Console](https://console.aws.amazon.com/cognito). - Configure sign-in (email/username), password policies, and add an - App Client with OAuth 2.0 enabled (Authorization Code grant, scopes - `openid` `profile` `email`, callback URL `/oauth/callback`). +1. Create a user pool in the [AWS Console](https://console.aws.amazon.com/cognito) + with Authorization Code grant, scopes `openid profile email`, callback + URL `/oauth/callback`. -2. **Note endpoints:** - - Issuer URL: `https://cognito-idp.{REGION}.amazonaws.com/{USER_POOL_ID}` - - Token URL: `https://{DOMAIN}.auth.{REGION}.amazoncognito.com/oauth2/token` - - Auth URL: `https://{DOMAIN}.auth.{REGION}.amazoncognito.com/oauth2/authorize` - - OIDC discovery: `https://cognito-idp.{REGION}.amazonaws.com/{USER_POOL_ID}/.well-known/openid-configuration` - -3. **Forward-mode MCP config:** +2. MCP config: ```yaml server: oauth: enabled: true - mode: forward + broker: true signing_secret: "<32-byte-random>" issuer: "https://cognito-idp..amazonaws.com/" audience: "" client_id: "" - client_secret: "" # via env + client_secret: "" token_url: "https://.auth..amazoncognito.com/oauth2/token" auth_url: "https://.auth..amazoncognito.com/oauth2/authorize" scopes: ["openid", "profile", "email"] ``` -4. **ClickHouse `token_processors`** (forward mode): +3. ClickHouse `token_processors` (if using Bearer): ```xml @@ -825,15 +446,10 @@ References: [Google OpenID Connect](https://developers.google.com/identity/openi ``` -References: [Amazon Cognito - OIDC IdPs](https://docs.aws.amazon.com/cognito/latest/developerguide/cognito-user-pools-oidc-idp.html), -[How to use OAuth 2.0 in Amazon Cognito](https://aws.amazon.com/blogs/security/how-to-use-oauth-2-0-in-amazon-cognito-learn-about-the-different-oauth-2-0-grants/), -[Cognito IdP endpoints](https://docs.aws.amazon.com/cognito/latest/developerguide/federation-endpoints.html). +References: [Amazon Cognito - OIDC IdPs](https://docs.aws.amazon.com/cognito/latest/developerguide/cognito-user-pools-oidc-idp.html). ## Helm chart deployment -The chart at [`helm/altinity-mcp/`](../helm/altinity-mcp/) supports all -OAuth options under `config.server.oauth`. Example values files: - ```bash helm install altinity-mcp ./helm/altinity-mcp \ -f helm/altinity-mcp/values_examples/mcp-oauth-keycloak.yaml @@ -843,42 +459,68 @@ helm install altinity-mcp ./helm/altinity-mcp \ - `values_examples/mcp-oauth-azure.yaml` — Azure AD - `values_examples/mcp-oauth-google.yaml` — Google Cloud Identity -For gating mode, also deploy the sidecar from +For the `ch-jwt-verify` sidecar path, also deploy from [`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper) into the ClickHouse pod. +## Frontend / Reverse Proxy Requirements + +The proxy must expose two public URL spaces: + +- the protected resource, e.g. `https://mcp.example.com/` +- the OAuth authorization server, e.g. `https://mcp.example.com/oauth` + +The proxy must: + +- Forward `Host` and `Authorization` headers unchanged +- Disable response buffering for MCP streaming +- Disable request buffering for long-lived POSTs +- Keep long read/send timeouts +- Not normalize or rewrite the configured callback or metadata paths + +Example nginx fragment: + +```nginx +location / { + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header Authorization $http_authorization; + proxy_buffering off; + proxy_request_buffering off; + proxy_read_timeout 3600; + proxy_send_timeout 3600; + proxy_pass http://ALTINITY_MCP_UPSTREAM; +} +``` + +If the IdP reports `redirect_uri_mismatch`, verify the public callback +URL the browser sees exactly matches the URI registered at the IdP. + ## Security considerations -- **`signing_secret`** protects all stateless OAuth artifacts (forward- - mode authorization codes, refresh-token JWEs, HKDF-derived signing - material). Treat it like a signing key. Rotate it to invalidate all - outstanding tokens. -- **MCP no longer holds a per-tenant ClickHouse credential.** In gating - mode there is no shared cluster secret. In forward mode the bearer is - the upstream IdP's JWT, validated end-to-end by `token_processors`. -- **Gating-mode tokens are upstream-IdP-issued.** MCP does not mint or - revoke them. Revocation propagates within the IdP's access-token TTL - window + the sidecar's positive-cache TTL (default 30 s). +- **`signing_secret`** protects all stateless OAuth artifacts. Treat it + like a signing key. Rotate it to invalidate all outstanding tokens. +- **MCP holds no per-tenant ClickHouse credential.** The bearer is the + upstream IdP's JWT; validation happens at ClickHouse (either + `token_processors` or the `ch-jwt-verify` sidecar). - **Opaque bearer tokens are not supported.** Inbound OAuth requires a signed JWT validatable via JWKS. RFC 7662 introspection is not implemented. -- **Token preference during browser login.** When both `id_token` and - `access_token` come back from the upstream, `altinity-mcp` prefers - `id_token` as the MCP bearer and falls back to `access_token` only - when no `id_token` is available. - **CIMD is the only inbound registration model.** Dynamic Client Registration (DCR, RFC 7591) is intentionally not exposed — - `/oauth/register` returns HTTP 410 Gone with an RFC 7591 §3.2.2 JSON - error pointing at the CIMD spec. + `/oauth/register` returns HTTP 410 Gone. +- **Token preference.** When both `id_token` and `access_token` come back + from the upstream, altinity-mcp prefers `id_token` and falls back to + `access_token` only when no `id_token` is available. ## Troubleshooting ### ClickHouse returns HTTP 403 with `Bearer HTTP Authorization scheme is not supported` -Forward mode without `token_processors`. The build is OSS-only or -otherwise lacks JWT auth. Either switch to gating mode (deploy the -sidecar; see [`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper)) or upgrade to a -ClickHouse build with `token_processors` (Altinity Antalya 25.8+). +The CH build does not support `token_processors`. altinity-mcp will +automatically fall back to Basic auth if the `ch-jwt-verify` sidecar is +running. If neither is configured, upgrade to Altinity Antalya 25.8+ or +deploy [`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper). ### Token validation fails with `issuer mismatch` @@ -889,37 +531,30 @@ ClickHouse build with `token_processors` (Altinity Antalya 25.8+). - Missing `/v2.0` suffix for Azure AD. - Wrong realm path for Keycloak. -In gating mode, `issuer` must exactly match the `iss` claim in the -AS-issued JWT. `public_auth_server_url` is forward-mode-only — startup -refuses it under gating. +### `oauth: bearer is not a JWT with an email claim` + +The JWT's top-level `email` claim is missing. Some IdPs strip standard +OIDC claims for third-party clients. Configure a post-login action that +injects `https:///email` into the access token — +altinity-mcp reads any claim with a `/email` suffix as a fallback. ### ClickHouse authenticates but the user has no permissions -Create the roles referenced in `common_roles` (forward) or -`DEFAULT ROLE` (gating) and grant them the necessary permissions: +Create the roles and grant them the necessary permissions: ```sql CREATE ROLE OR REPLACE default_role; GRANT SELECT ON *.* TO default_role; ``` -### Gating-mode error `oauth gating: bearer is not a JWT with an email claim` - -The IdP stripped the top-level `email` claim from the access token. -Some IdPs (Auth0 enhanced-security DCR is one) strip standard OIDC -claims for third-party clients unless they're set under a namespaced -URL claim. Configure a post-login action that injects -`https:///email` (or similar) into the access token; -`altinity-mcp` reads any claim with a `/email` suffix as a fallback for -the top-level `email`. +For the sidecar path, also verify the user exists: +`CREATE USER "alice@example.com" IDENTIFIED WITH http SERVER 'ch_jwt_verify' SCHEME 'BASIC'`. ### `block decode for exception: unexpected value 10 for boolean` -The clickhouse-go driver received text where it expected native binary -blocks. Common cause: a `FORMAT JSON` (or `FORMAT TSV`, etc.) suffix in -the SQL — the driver speaks native binary over HTTP and the format -override makes ClickHouse return text. Drop the `FORMAT` clause; MCP -serializes results to JSON for the LLM itself. +A `FORMAT JSON` (or similar) suffix in the SQL — the driver speaks native +binary over HTTP and the format override makes ClickHouse return text. +Drop the `FORMAT` clause. ### More troubleshooting @@ -927,3 +562,4 @@ For sidecar-specific errors (JWKS rotation, audience byte-equality, sidecar binding gotchas) see the [`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper) repo's troubleshooting section. + diff --git a/docs/oauth_troubleshooting.md b/docs/oauth_troubleshooting.md index 6bd56181..7312dbaa 100644 --- a/docs/oauth_troubleshooting.md +++ b/docs/oauth_troubleshooting.md @@ -1,6 +1,7 @@ # OAuth Troubleshooting Guide -This document covers common issues encountered when deploying OAuth forward mode with ClickHouse `token_processors`, based on real-world debugging experience. +This document covers common issues encountered when deploying OAuth with +altinity-mcp and ClickHouse. > **claude.ai JSX artifact users:** if your connector works in the main chat > but not from a JSX artifact (`✗ No tools attached — proxy didn't expose this @@ -46,15 +47,20 @@ DROP USER `user@example.com`; **Checklist:** -- The server is running in `mode: forward` (token forwarding to ClickHouse is automatic in this mode) -- The MCP client completed the OAuth flow — check MCP server logs for register, authorize, callback, and token exchange activity +- The server has `oauth.broker: true` and the MCP client completed the OAuth flow +- Check MCP server logs for authorize, callback, and token exchange activity - The MCP client is not reusing a stale token from a previous session -## Forward mode does not require static ClickHouse credentials +## OAuth broker does not require static ClickHouse credentials -In forward mode, each request forwards the OAuth bearer token to ClickHouse instead of using static credentials. The MCP server health check skips the ClickHouse ping in this mode (it reports `"auth": "per_request_credentials"` in the health response). +With `broker: true`, each request carries the OAuth bearer token to ClickHouse +instead of static credentials. The MCP server health check skips the ClickHouse +ping in this mode (it reports `"auth": "per_request_credentials"` in the health +response). -Any `username`/`password` in the ClickHouse config section is ignored for query execution in forward mode. If you see startup authentication errors, verify that the MCP server is not attempting a ClickHouse connection with empty or default credentials. +Any `username`/`password` in the ClickHouse config section is ignored for query +execution. If you see startup authentication errors, verify that the MCP server +is not attempting a ClickHouse connection with empty or default credentials. ## `username_claim` in token_processors defaults to `sub` @@ -79,13 +85,17 @@ Any `username`/`password` in the ClickHouse config section is ignored for query **Symptom:** The MCP server logs show occasional `ClickHouse ping failed during connection` errors with `invalid token supplied` from ClickHouse, even though queries succeed for authenticated users. -**Root cause:** An MCP client (e.g., Claude Desktop, Claude.ai) is retrying with a cached or stale token from a previous session. In forward mode, the MCP server does not validate tokens locally — it passes them through to ClickHouse, which rejects invalid tokens. +**Root cause:** An MCP client (e.g., Claude Desktop, Claude.ai) is retrying with a cached or stale token from a previous session. The MCP server passes the token through to ClickHouse, which rejects invalid tokens. -**Resolution:** These errors are transient and resolve once the client re-authenticates through the OAuth flow. They do not indicate a server-side configuration issue. If errors persist, ask the client to clear cached credentials for the MCP server and re-authenticate. +**Resolution:** These errors are transient and resolve once the client re-authenticates. They do not indicate a server-side configuration issue. If errors persist, ask the client to clear cached credentials for the MCP server and re-authenticate. ## ClickHouse returns HTTP 403 with "Bearer HTTP Authorization scheme is not supported" -The ClickHouse build does not support `token_processors`. Forward mode requires the Altinity Antalya build 25.8+ or a compatible ClickHouse version that supports `token_processors` and `user_directories` token authentication. +The ClickHouse build does not support `token_processors`. altinity-mcp will +automatically probe and fall back to Basic auth if the `ch-jwt-verify` sidecar +is deployed next to ClickHouse. If neither Bearer nor Basic succeeds, upgrade +to Altinity Antalya 25.8+ or deploy +[`altinity-oauth-helper`](https://github.com/altinity/altinity-oauth-helper). ## Useful diagnostic queries diff --git a/docs/proposal_gated_user_mapping.md b/docs/proposal_gated_user_mapping.md deleted file mode 100644 index d2e125f3..00000000 --- a/docs/proposal_gated_user_mapping.md +++ /dev/null @@ -1,394 +0,0 @@ -# Gating Mode with Real ClickHouse Authentication - -## Motivation - -Current gating mode validates OAuth tokens at the MCP proxy layer but connects -to ClickHouse with a single static set of credentials. This means: - -- No per-user audit trail in `system.query_log` — every query shows the same - ClickHouse user. -- No per-group access control — the MCP server decides what's allowed, not - ClickHouse grants. - -Using ClickHouse's [HTTP external authenticator](https://clickhouse.com/docs/operations/external-authenticators/http) -we can get real ClickHouse authentication in gating mode, with standard -(non-Antalya) ClickHouse builds. - -## Design - -### IdP group → ClickHouse user mapping - -Pre-create a ClickHouse user per IdP group. No ClickHouse roles are needed — -each user already has the appropriate grants. MCP maps the first matching IdP -group from the OAuth token to a ClickHouse user. - -Groups in the mapping are domain-qualified (`group.domain`) to support a -single IdP serving multiple organisations. MCP constructs the FQDN at runtime -by combining the group name from the token with a domain resolved from the -user's identity. - -```yaml -oauth: - group_claim: "groups" # JWT claim containing group list - group_domain_claim: "hd" # claim to use as domain (e.g. Google hd) - group_user_mapping: - engineering.altinity.com: "ch_engineering" - analytics.partner.com: "ch_analytics" - admin.altinity.com: "ch_admin" - default_user: "" # empty = reject if no group matches -``` - -**Domain resolution order:** -1. `group_domain_claim` — value of the configured claim (e.g. `hd: "altinity.com"`) -2. Email domain — extracted from `email` claim (`alice@altinity.com` → `altinity.com`) -3. If neither available — reject (cannot construct qualified group name) - -**Example:** Token contains `groups: ["engineering", "admin"]` and -`hd: "altinity.com"`. MCP constructs `["engineering.altinity.com", -"admin.altinity.com"]` and matches against `group_user_mapping` keys. - -A user from `partner.com` with `groups: ["analytics"]` and -`hd: "partner.com"` constructs `analytics.partner.com` → maps to -`ch_analytics`. Same IdP, different domain, different ClickHouse user. - -**Multi-group priority**: first match in config order wins. - -**No matching group**: returns 403 to the MCP client unless `default_user` is -set. - -### ClickHouse configuration - -Define the HTTP auth server and the users that delegate to it: - -```xml - - - - http://altinity-mcp:8080/auth/callback - 1000 - 1000 - 1000 - 1 - - - - - - - mcp_auth - basic - - - - - - - mcp_auth - basic - - - - - - mcp_auth - basic - - - - -``` - -`max_tries` is set to 1 — see [Replay and retries](#replay-and-retries) below. - -### Request flow - -``` - MCP Client MCP Server ClickHouse - ========== ========== ========== - │ │ │ - │ (1) MCP request │ │ - │ + OAuth token │ │ - │─────────────────────────▶│ │ - │ │ │ - │ ┌─────┴─────┐ │ - │ │ (2) Verify │ │ - │ │ token │ │ - │ │ (3) Check │ │ - │ │ policy │ │ - │ │ (4) Extract│ │ - │ │ groups │ │ - │ │ (5) Map to │ │ - │ │ CH user │ │ - │ │ (6) Gen │ │ - │ │ nonce │ │ - │ └─────┬─────┘ │ - │ │ │ - │ │ (7) SQL query │ - │ │ Authorization: Basic │ - │ │ user=ch_engineering │ - │ │ pass= │ - │ │ + log_comment header │ - │ │──────────────────────────▶│ - │ │ │ - │ │ (8) HTTP callback │ - │ │ POST /auth/callback │ - │ │ Authorization: Basic│ - │ │ (same creds) │ - │ │◀──────────────────────────│ - │ │ │ - │ ┌─────┴─────┐ │ - │ │ (9) Lookup │ │ - │ │ nonce, │ │ - │ │ consume, │ │ - │ │ 200/401 │ │ - │ └─────┬─────┘ │ - │ │ │ - │ │ 200 OK or 401 │ - │ │──────────────────────────▶│ - │ │ │ - │ │ (10) Query result │ - │ │◀──────────────────────────│ - │ │ │ - │ (11) Response │ │ - │◀─────────────────────────│ │ - │ │ │ - - Nonce lifecycle: - ┌──────────────────────────────────────────────────────┐ - │ Created at step 6 map[nonce] → {user, email, exp}│ - │ Consumed at step 9 deleted on first callback hit │ - │ TTL safety net expired nonces swept if unused │ - └──────────────────────────────────────────────────────┘ -``` - -**Step details:** - -1. MCP client sends request with OAuth access token. -2. MCP validates token (signature, expiry, issuer, audience). -3. MCP applies identity policy (domain allowlist, email verification). -4. MCP extracts groups from token via configured `group_claim`. -5. MCP resolves domain (`group_domain_claim` → email domain fallback), - constructs `group.domain` FQDNs, maps first match to a ClickHouse user - via `group_user_mapping`. If no match and no `default_user`, return 403. -6. MCP generates a single-use nonce, stores `nonce → {ch_user, email, expires_at}` - in an in-memory map with a short TTL (e.g. 10 seconds). -7. MCP sends the query to ClickHouse as HTTP basic auth: - `user=ch_engineering`, `password=`. - Also sets `X-ClickHouse-Setting-log_comment` to the real identity — see - [Audit trail](#audit-trail-in-systemquery_log). -8. ClickHouse calls back to `http://altinity-mcp:8080/auth/callback` with the - same basic auth credentials. -9. MCP looks up the nonce in the map. If found and not expired, consumes it - (delete from map) and returns 200. Otherwise returns 401. -10. ClickHouse executes or rejects the query based on the callback response. -11. MCP returns the query result to the client. - -### Replay and retries - -The nonce is consumed on first callback hit (delete from map). This makes -replay impossible — a second request with the same nonce gets 401. - -This conflicts with ClickHouse's `max_tries` retry behaviour: if the first -callback succeeds but the connection drops before ClickHouse reads the -response, CH retries — and the nonce is already consumed. - -Setting `max_tries=1` avoids this. The CH→MCP call is in-cluster (same k8s -namespace, no TLS needed), so transient failures are rare. If the callback -does fail, the query fails, and the MCP client retries the whole flow at the -application level. - -The nonce TTL (10 seconds) is a safety net for callbacks that never arrive -(ClickHouse crash, network partition). Expired nonces are lazily evicted or -swept periodically. - -### Audit trail in `system.query_log` - -Since multiple real users share the same ClickHouse user (e.g. all engineers -connect as `ch_engineering`), the `user` column alone doesn't identify who -ran a query. Two approaches to inject real identity: - -**Option A: `log_comment` (recommended)** - -MCP sets the `X-ClickHouse-Setting-log_comment` HTTP header on every query: - -``` -X-ClickHouse-Setting-log_comment: alice@corp.com -``` - -This populates the dedicated `system.query_log.log_comment` column: - -```sql -SELECT event_time, user, log_comment AS real_user, query -FROM system.query_log -WHERE type = 'QueryFinish' -ORDER BY event_time DESC -``` - -**Option B: custom setting** - -Use `X-ClickHouse-Setting-custom_oauth_email` instead, which populates -`system.query_log.Settings['custom_oauth_email']`. This keeps `log_comment` -free for other uses but requires querying the `Settings` map column. - -Both approaches use the existing ClickHouse HTTP settings header mechanism — -no ClickHouse-side configuration needed. - -## MCP server changes - -1. **New config fields**: `group_claim`, `group_user_mapping`, `default_user`, - `auth_callback_path`. -2. **New HTTP endpoint**: `/auth/callback` — accepts basic auth, looks up nonce, - returns 200 or 401. -3. **Nonce store**: in-memory map with TTL-based expiry. Stateless — if MCP - restarts, in-flight queries fail (clients retry). Acceptable tradeoff for - no persistent storage. -4. **Query dispatch**: in gating mode with `group_user_mapping` configured, - override ClickHouse username/password with the mapped user and nonce. Set - `log_comment` header with real identity. -5. **Existing gating mode**: unchanged when `group_user_mapping` is not - configured. This is an opt-in enhancement. - -## Comparison with forward mode (Antalya) - -| | Gating + HTTP auth callback | Forward mode (Antalya) | -|---|---|---| -| ClickHouse build | Standard | Antalya (token_processors) | -| User creation | Pre-created per group | Automatic via user_directories | -| Role mapping | Via CH grants on pre-created users | roles_filter + roles_transform | -| Identity in query_log | log_comment header | user column (username_claim) | -| Token lifecycle | MCP-controlled (nonce per query) | IdP-controlled (token passthrough) | - -## Group claims across Identity Providers - -The design above assumes groups are available in the OAuth token. There is -**no standard OIDC claim for groups** — each IdP handles this differently, -and some don't support it at all. - -### Summary - -| IdP | Claim name | Type | In token by default? | Config required | -|-----|-----------|------|---------------------|-----------------| -| Google Workspace | N/A | N/A | **No — not in tokens** | Groups require Directory API | -| Auth0 | custom namespaced | `[]string` | No | Action required | -| Okta | `groups` | `[]string` (names) | No | Custom claim in auth server | -| Keycloak | `groups` | `[]string` (names/paths) | No | Group Membership mapper | -| Microsoft Entra ID | `groups` | `[]string` (GUIDs) | No | Manifest setting | - -### Google Workspace - -Google **does not include group membership in OIDC tokens or the userinfo -endpoint**. The ID token provides `sub`, `email`, `name`, `hd` (hosted -domain), `email_verified` — but no groups. - -**To get groups** you must call the Google Directory API: -- Endpoint: `admin.googleapis.com/admin/directory/v1/groups` -- Requires a GCP service account with domain-wide delegation -- Requires `https://www.googleapis.com/auth/admin.directory.group.readonly` scope -- Requires an admin email to impersonate in the API call - -This is a significant integration: service account JSON key in config, an -extra API call after every authentication, and Google-specific code. - -**Workarounds without groups:** -- `AllowedHostedDomains: ["corp.com"]` — restricts to the Google Workspace org -- `AllowedEmailDomains: ["corp.com"]` — same effect via email domain -- These cover "only our org" but not finer-grained access like "only engineering" - -**Options for full group support with Google:** - -1. **Google Directory API integration** — new config fields for service account - key and admin email. After token validation, call the Directory API to - resolve groups, inject into claims. Adds Google-specific dependency. - -2. **Generic group resolver webhook** — after auth, POST the user's - email/claims to a configurable URL, get back a list of groups. The URL - points to a sidecar/lambda that calls the Google Directory API. More - flexible, works for any IdP, but adds latency and an external dependency. - -3. **Accept the limitation** — for Google deployments, use domain-based - allowlists. For group-level granularity, use an IdP that puts groups - in tokens (Okta, Keycloak, Auth0). - -### Auth0 - -Auth0 uses "Roles" (not directory groups). Roles are **not in tokens by -default**. - -**To add roles to tokens**, create a post-login Action: - -```javascript -exports.onExecutePostLogin = async (event, api) => { - api.idToken.setCustomClaim( - 'https://myapp.example.com/groups', - event.authorization.roles - ); -}; -``` - -The claim **must** use a namespaced URI — non-namespaced custom claims are -silently dropped from ID tokens. - -**MCP config:** -```yaml -oauth: - group_claim: "https://myapp.example.com/groups" -``` - -The namespaced claim lands in the `Extra` map of `OAuthClaims`. The -`group_claim` config tells the policy validator which key to look up. - -### Okta - -- **Claim**: `groups` — not included by default -- **Config**: Security > API > Authorization Server > Claims > Add Claim. - Name: `groups`, Value type: `Groups`, Filter: `Matches regex .*` -- **Type**: `[]string` of group names, e.g. `["Everyone", "Engineering"]` -- **Limit**: >100 groups may be omitted from the token. Fallback to - `/userinfo` or Okta Groups API. - -### Keycloak - -- **Claim**: `groups` — requires a "Group Membership" protocol mapper -- **Type**: `[]string` — either flat names (`["admin"]`) or full paths - (`["/org/admin"]`) depending on mapper config -- **Alternative**: `realm_access.roles` — included by default, contains - realm-level role names - -### Microsoft Entra ID (Azure AD) - -- **Claim**: `groups` — contains **Object IDs (GUIDs)**, not names -- **Config**: Set `"groupMembershipClaims"` in the app manifest -- **Overage**: Azure AD limits tokens to 200 groups. When exceeded, the - `groups` claim is omitted and replaced with a Graph API endpoint reference. - The application must call Microsoft Graph to get the full list. -- **Alternative**: `roles` claim — contains App Role value names (strings, - not GUIDs), no overage problem. Configure via app registration. For MCP, - `group_claim: "roles"` may be simpler than directory groups. - -### Design implications - -1. **`group_claim` must be configurable** — default `"groups"`, but Auth0 - needs a namespaced URI, Azure AD might prefer `"roles"`. - -2. **Expect `[]string`** — all IdPs use string arrays. Also handle - space-separated strings as fallback. - -3. **Google requires special handling** — Directory API integration, - webhook, or accept that group-level control needs a different IdP. - -4. **Case sensitivity** — compare group names case-insensitively to avoid - `"Admin"` vs `"admin"` misconfiguration. - -5. **Overage (Azure AD, Okta)** — when the group list is too large, the - claim may be omitted. A fallback to userinfo could help but adds - complexity and an extra HTTP call per request. - -## Ops notes - -- MCP's `/auth/callback` must be reachable from ClickHouse. In k8s, both run - in the same namespace — use the service DNS name. -- If MCP restarts, all in-flight nonces are lost. Queries in progress fail; - clients retry normally. -- Monitor the nonce map size. Under normal load it stays small (one entry per - concurrent query, consumed in milliseconds). A leak would indicate callbacks - not arriving — check CH→MCP connectivity. diff --git a/docs/tool_annotations.md b/docs/tool_annotations.md deleted file mode 100644 index cc8105e3..00000000 --- a/docs/tool_annotations.md +++ /dev/null @@ -1,27 +0,0 @@ -# Tool Safety Hints - -Altinity MCP uses MCP tool annotations to describe how risky a tool is and whether it can affect resources outside a bounded target. - -## Hints - -- `readOnlyHint` - Set to `true` for tools that only read, retrieve, or compute data and do not create, update, delete, or send data outside the client. - -- `destructiveHint` - Set to `true` for tools that can delete, overwrite, or otherwise have irreversible side effects. - -- `openWorldHint` - Set to `true` for tools that can write to arbitrary external resources such as URLs, files, or other unbounded targets. Set to `false` for bounded writes to known resources. - -## Important Relationship - -Per the OpenAI Apps SDK guidance, `destructiveHint` and `openWorldHint` are only relevant for write-capable tools, meaning when `readOnlyHint=false`. - -## Official References - -- OpenAI Apps SDK reference: - [https://developers.openai.com/apps-sdk/reference](https://developers.openai.com/apps-sdk/reference) -- OpenAI Apps SDK MCP server guide: - [https://developers.openai.com/apps-sdk/build/mcp-server](https://developers.openai.com/apps-sdk/build/mcp-server) -- Model Context Protocol schema for `ToolAnnotations`: - [https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations) diff --git a/docs/tools.md b/docs/tools.md index 225275e0..473f1418 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -262,7 +262,7 @@ The server accepts this form but logs a deprecation warning at startup. Prefer ` ## Dynamic discovery -Discovery is **lazy** — it runs on the first authenticated tool call, not at startup. This matters for OAuth forward mode, where no ClickHouse credentials exist until a user request arrives. +Discovery is **lazy** — it runs on the first authenticated tool call, not at startup. This matters for OAuth, where ClickHouse credentials can be derived from the user request. After a successful discovery pass, the server emits `notifications/tools/list_changed` so compatible MCP clients refresh their tool list. @@ -275,13 +275,13 @@ If discovery fails (e.g. credential error, network issue), static tools remain a ### Credentials used for discovery -| Auth mode | Credentials used | -|-----------|------------------| +| Auth | Credentials used | +|------|------------------| | JWE | The per-request JWE token from the triggering call. | -| OAuth forward | The forwarded Bearer token from the triggering call. | +| OAuth | The Bearer token from the triggering call. | | Plain / no auth | Static `clickhouse.username` / `clickhouse.password` from config, if set. | -Static credentials are no longer **required** for discovery in JWE or OAuth-forward setups — whichever token arrives with the first authenticated call is used to probe `system.tables`. +Static credentials are no longer **required** for discovery in JWE or OAuth setups — whichever token arrives with the first authenticated call is used to probe `system.tables`. --- @@ -325,14 +325,30 @@ Examples: ## MCP safety hints -| Tool kind | `readOnlyHint` | `destructiveHint` | -|-----------|----------------|-------------------| -| `execute_query` | `true` | `false` | -| Dynamic read (view) | `true` | `false` | -| `write_query` | `false` | `true` | -| Dynamic write (`insert`) | `false` | `false` | - -These hints follow the OpenAI Apps SDK tool-annotation guidance and reduce unnecessary confirmation prompts in compatible clients. +Altinity MCP sets MCP tool annotations (the OpenAI Apps SDK tool-safety hints) so +compatible clients can gauge each tool's risk and skip unnecessary confirmation +prompts: + +- **`readOnlyHint`** — `true` for tools that only read, retrieve, or compute and + never create, update, delete, or send data outside the client. +- **`destructiveHint`** — `true` for tools that can delete, overwrite, or + otherwise cause irreversible side effects. Only meaningful when + `readOnlyHint=false`. +- **`openWorldHint`** — `true` for tools that write to arbitrary/unbounded + external targets (URLs, files, etc.); `false` for bounded writes to known + resources. Altinity MCP only ever touches ClickHouse, so this is `false` by + default. Only meaningful when `readOnlyHint=false`. + +| Tool kind | `readOnlyHint` | `destructiveHint` | `openWorldHint` | +|-----------|----------------|-------------------|-----------------| +| `execute_query` | `true` | `false` | `false` | +| Dynamic read (view) | `true` | `false` | `false` | +| `write_query` | `false` | `true` | `false` | +| Dynamic write (`insert`) | `false` | `false` | `false` | + +Any of these can be overridden per dynamic tool via the view/table `COMMENT` +JSON `annotations` object (see [Tool metadata via `COMMENT`](#tool-metadata-via-comment)) — +e.g. a view that fans out to an external API can set `openWorldHint: true`. References: @@ -340,6 +356,7 @@ References: - [Define tools](https://developers.openai.com/apps-sdk/plan/tools) - [Build your MCP server](https://developers.openai.com/apps-sdk/build/mcp-server) - [MCP concepts / server docs](https://developers.openai.com/apps-sdk/concepts/mcp-server) +- [MCP `ToolAnnotations` schema](https://modelcontextprotocol.io/specification/2025-06-18/schema#toolannotations) --- diff --git a/go.mod b/go.mod index 1e85c03e..df00ffb1 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/stretchr/testify v1.11.1 github.com/urfave/cli/v3 v3.9.0 golang.org/x/net v0.55.0 + golang.org/x/sync v0.20.0 gopkg.in/yaml.v3 v3.0.1 ) @@ -19,7 +20,7 @@ require golang.org/x/crypto v0.52.0 // indirect require ( github.com/ClickHouse/ch-go v0.71.0 // indirect - github.com/altinity/go-mcp-oauth-sdk v0.1.0 + github.com/altinity/go-mcp-oauth-sdk v0.1.1-0.20260527145654-bdefa859fd1b github.com/andybalholm/brotli v1.2.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect diff --git a/go.sum b/go.sum index b9ec4658..245b944f 100644 --- a/go.sum +++ b/go.sum @@ -6,8 +6,8 @@ github.com/ClickHouse/ch-go v0.71.0 h1:bUdZ/EZj/LcVHsMqaRUP2holqygrPWQKeMjc6nZoy github.com/ClickHouse/ch-go v0.71.0/go.mod h1:NwbNc+7jaqfY58dmdDUbG4Jl22vThgx1cYjBw0vtgXw= github.com/ClickHouse/clickhouse-go/v2 v2.44.0 h1:9pxs5pRwIvhni5BDRPn/n5A8DeUod5TnBaeulFBX8EQ= github.com/ClickHouse/clickhouse-go/v2 v2.44.0/go.mod h1:giJfUVlMkcfUEPVfRpt51zZaGEx9i17gCos8gBl392c= -github.com/altinity/go-mcp-oauth-sdk v0.1.0 h1:CfenxSIv5KXRCiisImolm9Bk2U7BRoR5dHyElgnLY+8= -github.com/altinity/go-mcp-oauth-sdk v0.1.0/go.mod h1:yLgv0x586vIzPB5JaA6DkqQsSe4PLRT3PhU2iHO7qsI= +github.com/altinity/go-mcp-oauth-sdk v0.1.1-0.20260527145654-bdefa859fd1b h1:b6XHGgSAedsoRet5sahr4YABFC2tG+0Imd+8JGx16SY= +github.com/altinity/go-mcp-oauth-sdk v0.1.1-0.20260527145654-bdefa859fd1b/go.mod h1:yLgv0x586vIzPB5JaA6DkqQsSe4PLRT3PhU2iHO7qsI= github.com/andybalholm/brotli v1.2.1 h1:R+f5xP285VArJDRgowrfb9DqL18yVK0gKAW/F+eTWro= github.com/andybalholm/brotli v1.2.1/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -76,6 +76,8 @@ golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.45.0 h1:dO4czNzziLiiXplLQgBCEpCvXQ3dnkn0SdaZSYdQ+FY= golang.org/x/sys v0.45.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= diff --git a/helm/altinity-mcp/values.yaml b/helm/altinity-mcp/values.yaml index 59f7d4d9..0670edaa 100644 --- a/helm/altinity-mcp/values.yaml +++ b/helm/altinity-mcp/values.yaml @@ -153,6 +153,11 @@ env: [] # Configuration for the altinity-mcp application config: clickhouse: + # -- ClickHouse host. In multi-cluster mode (config.multicluster.enabled), + # the literal `{cluster}` placeholder is substituted with the validated + # {cluster} path value from /mcp/{cluster}. Typical k8s shape: + # host: "chi-{cluster}-{cluster}-0-0.demo" + # Single-cluster: use the static hostname directly. host: "localhost" port: 8123 database: "default" @@ -161,7 +166,9 @@ config: protocol: "http" read_only: false max_execution_time: 30 - limit: 1000 + # Server-side result cap (PR #125). Replaces the deprecated `limit` + # SQL-rewrite knob. 0 = use the runtime default. + max_result_rows: 1000 tls: enabled: false ca_cert: "" @@ -213,6 +220,28 @@ config: dynamic_tools: [] # - regexp: "db\\..*" # prefix: "custom_" - + + # -- Multi-cluster routing (#132). When enabled, MCP serves N clusters + # from one process via /mcp/{cluster}. clickhouse.host must contain the + # literal `{cluster}` placeholder (substituted at request time with the + # validated cluster name). Incompatible with server.jwe.enabled and + # server.openapi.enabled (deferred to v2). + multicluster: + # -- Enable URL-path multi-cluster routing + enabled: false + # -- Whitelist of cluster names accepted in /mcp/{cluster}. Must each + # be a valid RFC 1123 DNS label (^[a-z0-9]([-a-z0-9]{0,61}[a-z0-9])?$). + # An empty list allows any valid RFC 1123 cluster name. + cluster_allowlist: [] + # -- Maximum number of (bearer, cluster) catalog cache entries. + # Default 10000. Min 100. + catalog_cache_max: 10000 + # -- Fallback TTL for catalog cache entries when the JWT carries no + # exp claim. Allowed range: 1m .. 24h. + catalog_ttl_fallback: 15m + # -- TTL for denied (auth-class) catalog cache entries. Allowed range: + # 10s .. 5m. + catalog_negative_ttl: 60s + logging: level: "info" diff --git a/helm/altinity-mcp/values_examples/mcp-multicluster.yaml b/helm/altinity-mcp/values_examples/mcp-multicluster.yaml new file mode 100644 index 00000000..d1fc0c7a --- /dev/null +++ b/helm/altinity-mcp/values_examples/mcp-multicluster.yaml @@ -0,0 +1,68 @@ +# Example values for altinity-mcp serving N ClickHouse clusters from one +# helm release via URL-path routing (#132). +# +# Request URL shape: +# https:///mcp/{cluster} +# where {cluster} is one of `cluster_allowlist` and the literal +# `{cluster}` placeholder in clickhouse.host is substituted at request +# time. All other clickhouse.* fields are shared across clusters. +# +# Pre-requisites in the demo namespace: +# * Each ClickHouse pod (cluster otel, antalya, ...) must have the +# ch-jwt-verify sidecar from github.com/altinity/altinity-oauth-helper +# deployed alongside it, with audience pinned to the shared MCP +# resource URL below. +# * Per-user CH provisioning on every cluster: +# CREATE USER "alice@example.com" IDENTIFIED WITH http SERVER 'ch_jwt_verify' SCHEME 'BASIC'; + +image: + tag: "main" + pullPolicy: Always + +config: + clickhouse: + # The literal {cluster} placeholder is replaced with the validated + # cluster name from /mcp/{cluster} (RFC 1123 DNS label, must appear + # in cluster_allowlist below). + host: "chi-{cluster}-{cluster}-0-0.demo" + port: 8443 + protocol: "http" + read_only: false + + server: + transport: "http" + # OpenAPI is rejected at config-load in multi-cluster v1 — defer the + # per-cluster schema split to v2. + openapi: + enabled: false + oauth: + enabled: true + broker: false + issuer: "https://accounts.google.com" + jwks_url: "https://www.googleapis.com/oauth2/v3/certs" + # Shared audience across clusters in v1. Per-cluster audience is v2. + audience: "https://multi-mcp.demo.altinity.cloud/" + public_resource_url: "https://multi-mcp.demo.altinity.cloud/" + required_scopes: + - "openid" + - "email" + + multicluster: + enabled: true + cluster_allowlist: + - "otel" + - "antalya" + catalog_cache_max: 10000 + catalog_ttl_fallback: "15m" + catalog_negative_ttl: "60s" + + logging: + level: "info" + +ingress: + enabled: true + hosts: + - host: multi-mcp.demo.altinity.cloud + paths: + - path: / + pathType: Prefix diff --git a/pkg/config/cli_reflect.go b/pkg/config/cli_reflect.go index da3c3aff..4412a919 100644 --- a/pkg/config/cli_reflect.go +++ b/pkg/config/cli_reflect.go @@ -4,10 +4,16 @@ import ( "fmt" "reflect" "strconv" + "time" "github.com/urfave/cli/v3" ) +// durationType is the reflect.Type for time.Duration. Compared by type (not +// kind) because Duration is an int64 alias; without the type-level check we +// would route Durations through the int branch and lose unit parsing. +var durationType = reflect.TypeOf(time.Duration(0)) + // Command is the subset of urfave/cli/v3 *cli.Command that ApplyFlags needs. // It mirrors the same interface used by overrideWithCLIFlags so the helper // stays testable with a fake. @@ -94,6 +100,18 @@ func makeFlag(flagName, envVar, desc, defaultStr string, value reflect.Value) cl if envVar != "" { sources = cli.EnvVars(envVar) } + // time.Duration is technically Kind()==Int64 — handle it as a string flag + // with time.ParseDuration so operators can write "15m" / "60s" instead of + // strconv.Atoi("15m") tripping at flag-build time. + if value.Type() == durationType { + defaultVal := defaultStr + if defaultStr != "" { + if _, err := time.ParseDuration(defaultStr); err != nil { + panic(fmt.Sprintf("config: bad default for duration flag %s: %v", flagName, err)) + } + } + return &cli.StringFlag{Name: flagName, Usage: desc, Value: defaultVal, Sources: sources} + } switch value.Kind() { case reflect.String: return &cli.StringFlag{Name: flagName, Usage: desc, Value: defaultStr, Sources: sources} @@ -130,6 +148,20 @@ func makeFlag(flagName, envVar, desc, defaultStr string, value reflect.Value) cl } func applyOne(flagName, defaultStr string, value reflect.Value, cmd Command) { + // time.Duration: flag is a StringFlag carrying a parseable duration. + if value.Type() == durationType { + switch { + case cmd.IsSet(flagName): + if d, err := time.ParseDuration(cmd.String(flagName)); err == nil { + value.SetInt(int64(d)) + } + case value.IsZero() && defaultStr != "": + if d, err := time.ParseDuration(defaultStr); err == nil { + value.SetInt(int64(d)) + } + } + return + } switch value.Kind() { case reflect.String: switch { diff --git a/pkg/config/cli_reflect_test.go b/pkg/config/cli_reflect_test.go index 70c5301d..067f1f7b 100644 --- a/pkg/config/cli_reflect_test.go +++ b/pkg/config/cli_reflect_test.go @@ -2,6 +2,7 @@ package config import ( "testing" + "time" "github.com/stretchr/testify/require" "github.com/urfave/cli/v3" @@ -48,7 +49,7 @@ func TestBuildFlags_ConfigStruct(t *testing.T) { require.Contains(t, byName, "blocked-query-clauses") // All OAuth fields should be wired (issue #96). - require.Contains(t, byName, "oauth-mode") + require.Contains(t, byName, "oauth-broker") require.Contains(t, byName, "oauth-enabled") require.Contains(t, byName, "oauth-issuer") require.Contains(t, byName, "oauth-jwks-url") @@ -85,10 +86,9 @@ func TestApplyFlags_SetsValues(t *testing.T) { "clickhouse-host": "ch.internal", "oauth-signing-secret": "shh", "transport": "http", - "oauth-mode": "forward", }, - ints: map[string]int{"clickhouse-port": 9000}, - bools: map[string]bool{"server-tls": true, "oauth-enabled": true}, + ints: map[string]int{"clickhouse-port": 9000}, + bools: map[string]bool{"server-tls": true, "oauth-enabled": true, "oauth-broker": true}, slices: map[string][]string{"oauth-required-scopes": {"openid", "email"}}, wasSet: map[string]bool{ "clickhouse-host": true, @@ -98,7 +98,7 @@ func TestApplyFlags_SetsValues(t *testing.T) { "oauth-signing-secret": true, "oauth-required-scopes": true, "transport": true, - "oauth-mode": true, + "oauth-broker": true, }, } @@ -108,13 +108,13 @@ func TestApplyFlags_SetsValues(t *testing.T) { require.Equal(t, 9000, cfg.ClickHouse.Port) require.True(t, cfg.Server.TLS.Enabled) require.True(t, cfg.Server.OAuth.Enabled) + require.True(t, cfg.Server.OAuth.Broker) require.Equal(t, "shh", cfg.Server.OAuth.SigningSecret) require.Equal(t, []string{"openid", "email"}, cfg.Server.OAuth.RequiredScopes) // Type-alias conversion: cmd.String returns a plain string, but the // struct field is MCPTransport — Convert() handles that. require.Equal(t, MCPTransport("http"), cfg.Server.Transport) - require.Equal(t, "forward", cfg.Server.OAuth.Mode) } func TestApplyFlags_DefaultFallback(t *testing.T) { @@ -166,6 +166,30 @@ func TestApplyFlags_CLIBeatsYAML(t *testing.T) { require.Equal(t, "from-cli.example", cfg.ClickHouse.Host) } +func TestBuildFlags_Duration_BuildsAndApplies(t *testing.T) { + t.Parallel() + type S struct { + TTL time.Duration `flag:"ttl" default:"15m" desc:"ttl"` + } + flags := BuildFlags(&S{}) + require.Len(t, flags, 1) + require.IsType(t, &cli.StringFlag{}, flags[0]) + require.Equal(t, "15m", flags[0].(*cli.StringFlag).Value) + + // Apply with default fallback (no CLI flag set, zero field). + s := &S{} + ApplyFlags(s, &fakeCmd{wasSet: map[string]bool{}}) + require.Equal(t, 15*time.Minute, s.TTL) + + // Apply with explicit CLI value overrides any in-struct value. + s = &S{TTL: 5 * time.Minute} + ApplyFlags(s, &fakeCmd{ + strs: map[string]string{"ttl": "1h30m"}, + wasSet: map[string]bool{"ttl": true}, + }) + require.Equal(t, 90*time.Minute, s.TTL) +} + func TestBuildFlags_NoFlagTagSkipped(t *testing.T) { t.Parallel() type S struct { diff --git a/pkg/config/config.go b/pkg/config/config.go index 9a821342..c573d80a 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -6,14 +6,14 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/altinity/go-mcp-oauth-sdk/oauth" "gopkg.in/yaml.v3" ) // OAuthConfig is an alias for oauth.OAuthConfig so existing call sites that -// reference config.OAuthConfig continue to compile. The struct definition and -// the NormalizedMode/IsForwardMode/IsGatingMode helpers live in pkg/oauth. +// reference config.OAuthConfig continue to compile. type OAuthConfig = oauth.OAuthConfig // ClickHouseProtocol defines the protocol used to connect to ClickHouse @@ -207,12 +207,57 @@ type LoggingConfig struct { Level LogLevel `json:"level" yaml:"level" flag:"log-level" env:"LOG_LEVEL" default:"info" desc:"Logging level (debug/info/warn/error)"` } +// MulticlusterConfig enables URL-path routing across multiple ClickHouse +// clusters from a single MCP process. When Enabled, requests to +// /mcp/{cluster} extract {cluster} from the path, validate it against the +// allowlist + RFC 1123 DNS label regex, and substitute it into the +// `{cluster}` placeholder in ClickHouseConfig.Host. All other ClickHouse +// settings (port, TLS, mode, regexes, limits, ...) are shared across +// clusters. See issue #132 and docs/multicluster.md. +type MulticlusterConfig struct { + Enabled bool `json:"enabled" yaml:"enabled" flag:"multicluster-enabled" env:"MCP_MULTICLUSTER_ENABLED" desc:"Enable URL-path routing across multiple ClickHouse clusters via /mcp/{cluster}"` + ClusterAllowlist []string `json:"cluster_allowlist" yaml:"cluster_allowlist" flag:"multicluster-cluster-allowlist" env:"MCP_MULTICLUSTER_CLUSTER_ALLOWLIST" desc:"Whitelist of cluster names accepted in /mcp/{cluster}"` + CatalogCacheMax int `json:"catalog_cache_max,omitempty" yaml:"catalog_cache_max,omitempty" flag:"multicluster-catalog-cache-max" env:"MCP_MULTICLUSTER_CATALOG_CACHE_MAX" desc:"Maximum number of (bearer,cluster) catalog cache entries (default 10000)"` + CatalogTTLFallback time.Duration `json:"catalog_ttl_fallback,omitempty" yaml:"catalog_ttl_fallback,omitempty" flag:"multicluster-catalog-ttl-fallback" env:"MCP_MULTICLUSTER_CATALOG_TTL_FALLBACK" desc:"Fallback TTL for catalog cache entries when bearer has no exp (default 15m)"` + CatalogNegativeTTL time.Duration `json:"catalog_negative_ttl,omitempty" yaml:"catalog_negative_ttl,omitempty" flag:"multicluster-catalog-negative-ttl" env:"MCP_MULTICLUSTER_CATALOG_NEGATIVE_TTL" desc:"TTL for negative (denied) catalog cache entries (default 60s)"` +} + +// Defaults applied by applyMulticlusterDefaults. +const ( + defaultMulticlusterCatalogCacheMax = 10000 + defaultMulticlusterCatalogTTLFallback = 15 * time.Minute + defaultMulticlusterCatalogNegativeTTL = 60 * time.Second +) + +// applyMulticlusterDefaults fills zero-valued Multicluster fields with the +// documented defaults. Safe to call on both Enabled and disabled configs; +// caller decides whether to consult the values. +func applyMulticlusterDefaults(mc *MulticlusterConfig) { + if mc.CatalogCacheMax == 0 { + mc.CatalogCacheMax = defaultMulticlusterCatalogCacheMax + } + if mc.CatalogTTLFallback == 0 { + mc.CatalogTTLFallback = defaultMulticlusterCatalogTTLFallback + } + if mc.CatalogNegativeTTL == 0 { + mc.CatalogNegativeTTL = defaultMulticlusterCatalogNegativeTTL + } +} + +// ApplyMulticlusterDefaults fills zero-valued multicluster settings with the +// documented defaults. Call this after config-file loading and again after +// CLI/env overrides so env-only deployments get the same defaults. +func ApplyMulticlusterDefaults(cfg *Config) { + applyMulticlusterDefaults(&cfg.Multicluster) +} + // Config is the main application configuration type Config struct { - ClickHouse ClickHouseConfig `json:"clickhouse" yaml:"clickhouse"` - Server ServerConfig `json:"server" yaml:"server"` - Logging LoggingConfig `json:"logging" yaml:"logging"` - ReloadTime int `json:"reload_time,omitempty" yaml:"reload_time,omitempty" desc:"Configuration reload interval in seconds (0 to disable)"` + ClickHouse ClickHouseConfig `json:"clickhouse" yaml:"clickhouse"` + Server ServerConfig `json:"server" yaml:"server"` + Logging LoggingConfig `json:"logging" yaml:"logging"` + Multicluster MulticlusterConfig `json:"multicluster" yaml:"multicluster"` + ReloadTime int `json:"reload_time,omitempty" yaml:"reload_time,omitempty" desc:"Configuration reload interval in seconds (0 to disable)"` // RemovedKeyWarnings holds human-readable warnings about config keys // that LoadConfigFromFile observed in the input file but the current @@ -252,6 +297,7 @@ func LoadConfigFromFile(filename string) (*Config, error) { } } config.RemovedKeyWarnings = removedKeyWarnings(data) + ApplyMulticlusterDefaults(config) return config, nil } @@ -263,10 +309,12 @@ func LoadConfigFromFile(filename string) (*Config, error) { // override is now a no-op. Exported so external tooling (linters, CI // gates, deploy automation) can share the same source of truth. var RemovedConfigKeys = []RemovedKey{ - {Path: "clickhouse.cluster_secret", Replacement: "Use mode: gating + the ch-jwt-verify sidecar (github.com/altinity/altinity-oauth-helper). Drop cluster_secret + cluster_name from helm values and bind users with IDENTIFIED WITH http SERVER 'ch_jwt_verify' SCHEME 'BASIC'."}, + {Path: "clickhouse.cluster_secret", Replacement: "Use broker:false with the ch-jwt-verify sidecar (github.com/altinity/altinity-oauth-helper). Drop cluster_secret + cluster_name from helm values and bind users with IDENTIFIED WITH http SERVER 'ch_jwt_verify' SCHEME 'BASIC'."}, {Path: "clickhouse.cluster_name", Replacement: "Same as cluster_secret — drop both together."}, - {Path: "server.oauth.claims_to_headers", Replacement: "Removed — the gating-mode wire format no longer forwards arbitrary claims as headers. Per-scope ClickHouse session settings live in the sidecar's settings_from_scope config."}, - {Path: "server.oauth.clickhouse_header_name", Replacement: "Removed — forward mode always uses Authorization: Bearer."}, + {Path: "server.oauth.mode", Replacement: "Removed — use server.oauth.broker: true or false."}, + {Path: "server.oauth.broker_upstream", Replacement: "Removed — use server.oauth.broker: true."}, + {Path: "server.oauth.claims_to_headers", Replacement: "Removed — MCP no longer forwards arbitrary claims as headers. Per-scope ClickHouse session settings live in the sidecar's settings_from_scope config."}, + {Path: "server.oauth.clickhouse_header_name", Replacement: "Removed — Bearer auth uses Authorization: Bearer."}, {Path: "server.oauth.allowed_email_domains", Replacement: "Moved to the ch-jwt-verify sidecar's identity.allowed_email_domains."}, {Path: "server.oauth.allowed_hosted_domains", Replacement: "Moved to the ch-jwt-verify sidecar's identity.allowed_hosted_domains."}, {Path: "server.oauth.allow_unverified_email", Replacement: "Moved (inverted) to the ch-jwt-verify sidecar's identity.require_email_verified."}, diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 504fe485..1cfe08b4 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -358,48 +358,6 @@ func TestConfigConstants(t *testing.T) { }) } -// TestNormalizedMode tests OAuthConfig.NormalizedMode() -func TestNormalizedMode(t *testing.T) { - t.Parallel() - tests := []struct { - name string - mode string - want string - }{ - {"forward", "forward", "forward"}, - {"gating", "gating", "gating"}, - {"empty_defaults_to_gating", "", "gating"}, - {"uppercase_forward", "FORWARD", "forward"}, - {"mixed_case_gating", "Gating", "gating"}, - {"whitespace_trimmed", " forward ", "forward"}, - {"unknown_mode_passthrough", "custom", "custom"}, - {"another_unknown", "hybrid", "hybrid"}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - cfg := OAuthConfig{Mode: tt.mode} - require.Equal(t, tt.want, cfg.NormalizedMode()) - }) - } -} - -// TestIsForwardMode tests OAuthConfig.IsForwardMode() -func TestIsForwardMode(t *testing.T) { - t.Parallel() - require.True(t, OAuthConfig{Mode: "forward"}.IsForwardMode()) - require.False(t, OAuthConfig{Mode: "gating"}.IsForwardMode()) - require.False(t, OAuthConfig{Mode: ""}.IsForwardMode()) -} - -// TestIsGatingMode tests OAuthConfig.IsGatingMode() -func TestIsGatingMode(t *testing.T) { - t.Parallel() - require.True(t, OAuthConfig{Mode: "gating"}.IsGatingMode()) - require.True(t, OAuthConfig{Mode: ""}.IsGatingMode()) // default - require.False(t, OAuthConfig{Mode: "forward"}.IsGatingMode()) -} - // TestLoadConfigFromFile_YMLExtension tests .yml extension loading func TestLoadConfigFromFile_YMLExtension(t *testing.T) { t.Parallel() @@ -779,6 +737,8 @@ clickhouse: cluster_name: "demo" server: oauth: + mode: gating + broker_upstream: true claims_to_headers: sub: X-User clickhouse_header_name: X-Token @@ -813,7 +773,7 @@ clickhouse: server: oauth: enabled: true - mode: gating + issuer: https://issuer.example.com `) require.Nil(t, removedKeyWarnings(yamlData)) }) diff --git a/pkg/server/catalog_cache.go b/pkg/server/catalog_cache.go new file mode 100644 index 00000000..f5ec0466 --- /dev/null +++ b/pkg/server/catalog_cache.go @@ -0,0 +1,410 @@ +package server + +import ( + "context" + "errors" + "math/rand" + "sync" + "sync/atomic" + "time" + + "github.com/altinity/altinity-mcp/pkg/clickhouse" + "github.com/altinity/altinity-mcp/pkg/config" + "github.com/rs/zerolog/log" + "golang.org/x/sync/singleflight" +) + +const ( + // negativeEvictionThreshold is the fill ratio above which insertion of a + // new entry randomly evicts an existing `denied` entry. Positives are + // never displaced. Keeps the cache from being filled with attack churn. + negativeEvictionThreshold = 0.7 + // maxConcurrentDiscoveries is the hardcoded ceiling on simultaneous + // tool-discovery round-trips to ClickHouse. A single discovery is a + // handful of system-table queries and completes well within the + // discoveryTimeout below; the ceiling exists so a connector storm + // (many distinct bearers missing the cache at once) cannot exhaust the + // CH pod's connection pool. v1 decision: not configurable — re-evaluate + // at v2. + maxConcurrentDiscoveries = 16 + // discoveryTimeout bounds an individual DiscoverTools invocation + // triggered by GetOrDiscover, separate from any outer ctx timeout. A + // timeout is classified transient (not cached), so the next request + // retries — keep this comfortably above normal discovery latency so a + // busy-but-healthy cluster is not perpetually forced onto static-only. + discoveryTimeout = 3 * time.Second +) + +// ErrDiscoveryDenied signals that the catalog cache memoised an auth-class +// rejection for this (bearer, cluster). Callers should fall back to +// static-only tools without re-attempting discovery until the cached +// entry expires. +var ErrDiscoveryDenied = errors.New("multicluster: catalog discovery denied (cached)") + +// ErrDiscoverySaturated signals that all maxConcurrentDiscoveries slots +// are in use and the caller's ctx fired before one freed. Callers should +// fall back to static-only tools and emit a counter. +var ErrDiscoverySaturated = errors.New("multicluster: discovery concurrency saturated") + +type catalogOutcome int + +const ( + catalogOutcomeOK catalogOutcome = iota + catalogOutcomeDenied +) + +// catalogEntry is one row in the catalog cache. Tools is nil when Outcome +// is catalogOutcomeDenied. ErrClass is informational only. +type catalogEntry struct { + Outcome catalogOutcome + Tools map[string]dynamicToolMeta + ExpiresAt time.Time + ErrClass DiscoveryErrorClass +} + +// CatalogCacheMetrics carries the catalog cache's atomic counters. The +// process has no Prometheus subsystem; these are surfaced as a JSON +// snapshot on the /health endpoint (see CatalogCache.Snapshot and the +// healthHandler in cmd/altinity-mcp). Kept as a plain struct of atomics so +// the cache layer has no metrics-backend dependency. +type CatalogCacheMetrics struct { + HitsOK atomic.Uint64 + HitsDenied atomic.Uint64 + Misses atomic.Uint64 + FullDropsOK atomic.Uint64 + FullDropsDenied atomic.Uint64 + DiscoverySaturate atomic.Uint64 + DiscoveryAuthErr atomic.Uint64 + DiscoveryTransErr atomic.Uint64 +} + +// CatalogCacheSnapshot is a point-in-time, JSON-serialisable view of the +// catalog cache's counters plus live size, returned by Snapshot for the +// /health endpoint. +type CatalogCacheSnapshot struct { + Entries int `json:"entries"` + DeniedEntries int `json:"denied_entries"` + Max int `json:"max"` + HitsOK uint64 `json:"hits_ok"` + HitsDenied uint64 `json:"hits_denied"` + Misses uint64 `json:"misses"` + FullDropsOK uint64 `json:"full_drops_ok"` + FullDropsDenied uint64 `json:"full_drops_denied"` + DiscoverySaturate uint64 `json:"discovery_saturate"` + DiscoveryAuthErr uint64 `json:"discovery_auth_err"` + DiscoveryTransErr uint64 `json:"discovery_transient_err"` +} + +// Snapshot returns a consistent read of the cache counters and current +// occupancy. Safe to call concurrently with cache operations. +func (c *CatalogCache) Snapshot() CatalogCacheSnapshot { + c.mu.Lock() + entries := len(c.entries) + denied := len(c.deniedKeys) + c.mu.Unlock() + return CatalogCacheSnapshot{ + Entries: entries, + DeniedEntries: denied, + Max: c.max, + HitsOK: c.Metrics.HitsOK.Load(), + HitsDenied: c.Metrics.HitsDenied.Load(), + Misses: c.Metrics.Misses.Load(), + FullDropsOK: c.Metrics.FullDropsOK.Load(), + FullDropsDenied: c.Metrics.FullDropsDenied.Load(), + DiscoverySaturate: c.Metrics.DiscoverySaturate.Load(), + DiscoveryAuthErr: c.Metrics.DiscoveryAuthErr.Load(), + DiscoveryTransErr: c.Metrics.DiscoveryTransErr.Load(), + } +} + +// ClientFactory creates a ClickHouse client from a context + chCfg. The +// catalog cache passes through the request ctx so credential extraction +// (bearer in ctx) works as in single-cluster mode. +type ClientFactory func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) + +// CatalogCache memoises (bearer, cluster) → discovered dynamic tools. +// Thread-safe; one instance per MCP process. Stop with Close() during +// shutdown to terminate the janitor goroutine. +type CatalogCache struct { + mu sync.Mutex + entries map[string]catalogEntry + deniedKeys []string // shadow index of denied-only entries for O(1) eviction + max int + fallbackTTL time.Duration + negativeTTL time.Duration + sf singleflight.Group + sem chan struct{} + rng *rand.Rand + rngMu sync.Mutex + stop chan struct{} + stopped atomic.Bool + Metrics CatalogCacheMetrics +} + +// NewCatalogCache creates a catalog cache sized per cfg and starts the +// janitor goroutine. cfg must already have had applyMulticlusterDefaults +// run on it (caller responsibility). +func NewCatalogCache(cfg config.MulticlusterConfig) *CatalogCache { + if cfg.CatalogCacheMax <= 0 { + cfg.CatalogCacheMax = defaultMulticlusterCacheMax + } + if cfg.CatalogTTLFallback <= 0 { + cfg.CatalogTTLFallback = defaultMulticlusterFallbackTTL + } + if cfg.CatalogNegativeTTL <= 0 { + cfg.CatalogNegativeTTL = defaultMulticlusterNegativeTTL + } + + c := &CatalogCache{ + entries: make(map[string]catalogEntry, cfg.CatalogCacheMax), + max: cfg.CatalogCacheMax, + fallbackTTL: cfg.CatalogTTLFallback, + negativeTTL: cfg.CatalogNegativeTTL, + sem: make(chan struct{}, maxConcurrentDiscoveries), + rng: rand.New(rand.NewSource(time.Now().UnixNano())), + stop: make(chan struct{}), + } + go c.janitor() + return c +} + +// Defaults mirrored from config to avoid a circular import: NewCatalogCache +// is robust to a zero-valued MulticlusterConfig even if the operator +// constructs one by hand. +const ( + defaultMulticlusterCacheMax = 10000 + defaultMulticlusterFallbackTTL = 15 * time.Minute + defaultMulticlusterNegativeTTL = 60 * time.Second + catalogCacheJanitorIntervalSecs = 60 +) + +// Close terminates the janitor goroutine. Safe to call multiple times. +func (c *CatalogCache) Close() { + if c.stopped.CompareAndSwap(false, true) { + close(c.stop) + } +} + +func (c *CatalogCache) janitor() { + ticker := time.NewTicker(catalogCacheJanitorIntervalSecs * time.Second) + defer ticker.Stop() + for { + select { + case <-c.stop: + return + case now := <-ticker.C: + c.sweepExpired(now) + } + } +} + +func (c *CatalogCache) sweepExpired(now time.Time) { + c.mu.Lock() + defer c.mu.Unlock() + for k, e := range c.entries { + if !e.ExpiresAt.IsZero() && now.After(e.ExpiresAt) { + delete(c.entries, k) + } + } + // Rebuild deniedKeys to drop swept-out denied entries. + if len(c.deniedKeys) > 0 { + filtered := c.deniedKeys[:0] + for _, k := range c.deniedKeys { + if e, ok := c.entries[k]; ok && e.Outcome == catalogOutcomeDenied { + filtered = append(filtered, k) + } + } + c.deniedKeys = filtered + } +} + +// fullKey assembles the lookup key. Bearer hash + null + cluster: the null +// byte is impossible in a hex string so there is no aliasing risk and the +// concatenation is unambiguous. +func fullKey(cacheKey, cluster string) string { + return cacheKey + "\x00" + cluster +} + +// GetOrDiscover returns the cached catalog for (cacheKey, cluster), or +// runs DiscoverTools under a singleflight + concurrency-limited semaphore +// to populate it. The bearerExp parameter is the JWT exp claim (zero +// if absent); positive entries' TTL is min(bearerExp, now+fallbackTTL). +// +// Returns ErrDiscoveryDenied when the (cacheKey, cluster) is currently +// memoised as a denied entry; callers should fall back to static-only +// without retrying. Returns ErrDiscoverySaturated when discovery slots +// are exhausted and the caller's ctx fires before one frees; same +// fallback behavior. +func (c *CatalogCache) GetOrDiscover( + ctx context.Context, + cacheKey, cluster string, + reqCfg config.ClickHouseConfig, + factory ClientFactory, + rules []config.DynamicToolRule, + readOnly bool, + bearerExp time.Time, +) (map[string]dynamicToolMeta, error) { + key := fullKey(cacheKey, cluster) + + // Fast path: existing entry under read-style lock. + c.mu.Lock() + if e, ok := c.entries[key]; ok && (e.ExpiresAt.IsZero() || time.Now().Before(e.ExpiresAt)) { + c.mu.Unlock() + switch e.Outcome { + case catalogOutcomeOK: + c.Metrics.HitsOK.Add(1) + return e.Tools, nil + case catalogOutcomeDenied: + c.Metrics.HitsDenied.Add(1) + return nil, ErrDiscoveryDenied + } + } + c.mu.Unlock() + c.Metrics.Misses.Add(1) + + // Coalesce concurrent misses on the same key. + v, err, _ := c.sf.Do(key, func() (interface{}, error) { + // Re-check after acquiring the singleflight slot (another inflight + // caller may have populated the entry). + c.mu.Lock() + if e, ok := c.entries[key]; ok && (e.ExpiresAt.IsZero() || time.Now().Before(e.ExpiresAt)) { + c.mu.Unlock() + if e.Outcome == catalogOutcomeOK { + return e.Tools, nil + } + return nil, ErrDiscoveryDenied + } + c.mu.Unlock() + + // Acquire a discovery slot or fail under ctx. + select { + case c.sem <- struct{}{}: + defer func() { <-c.sem }() + case <-ctx.Done(): + c.Metrics.DiscoverySaturate.Add(1) + return nil, ErrDiscoverySaturated + } + + ctxTimeout, cancel := context.WithTimeout(ctx, discoveryTimeout) + defer cancel() + + tools, derr := DiscoverTools(ctxTimeout, reqCfg, factory, rules, readOnly) + if derr != nil { + auth, class := ClassifyDiscoveryError(derr) + if auth { + c.Metrics.DiscoveryAuthErr.Add(1) + c.insertDenied(key, class) + } else { + c.Metrics.DiscoveryTransErr.Add(1) + } + return nil, derr + } + c.insertOK(key, tools, bearerExp) + return tools, nil + }) + if err != nil { + return nil, err + } + if tools, ok := v.(map[string]dynamicToolMeta); ok { + return tools, nil + } + return nil, nil +} + +func (c *CatalogCache) insertOK(key string, tools map[string]dynamicToolMeta, bearerExp time.Time) { + c.mu.Lock() + defer c.mu.Unlock() + exp := time.Now().Add(c.fallbackTTL) + if !bearerExp.IsZero() && bearerExp.Before(exp) { + exp = bearerExp + } + if !c.makeRoomLocked(catalogOutcomeOK) { + log.Warn().Str("key_suffix", key[len(key)-12:]).Msg("catalog_cache: full; dropping positive entry") + c.Metrics.FullDropsOK.Add(1) + return + } + c.entries[key] = catalogEntry{ + Outcome: catalogOutcomeOK, + Tools: tools, + ExpiresAt: exp, + } +} + +func (c *CatalogCache) insertDenied(key string, class DiscoveryErrorClass) { + c.mu.Lock() + defer c.mu.Unlock() + if !c.makeRoomLocked(catalogOutcomeDenied) { + c.Metrics.FullDropsDenied.Add(1) + return + } + c.entries[key] = catalogEntry{ + Outcome: catalogOutcomeDenied, + ExpiresAt: time.Now().Add(c.negativeTTL), + ErrClass: class, + } + c.deniedKeys = append(c.deniedKeys, key) +} + +// makeRoomLocked guarantees space for one new insert. Caller holds c.mu. +// +// At fill ≥ 70% of max, attempts random eviction of a denied entry. +// Positives are never displaced — under churning attack traffic this +// degrades gracefully to "no more denied entries get cached" rather than +// thrashing legitimate-user catalogs. +// +// Returns false only when the cache is at hard cap AND no denied entry is +// available to evict AND outcome is denied (positive inserts are always +// allowed past the threshold by spilling — the hard cap is informational +// because positive entries are bounded by the (bearer, cluster) cardinality +// the OAuth issuer is willing to mint). +func (c *CatalogCache) makeRoomLocked(outcome catalogOutcome) bool { + cur := len(c.entries) + if cur < c.max { + // Even below the threshold we want random eviction once fill ≥ 70%. + if float64(cur)/float64(c.max) >= negativeEvictionThreshold { + c.evictOneDeniedLocked() + } + return true + } + // At/above hard cap: evict a denied entry to make room, or drop on + // denied insert (positive insert is allowed to exceed cap by 1 — + // preferable to losing a legitimate user's catalog). + if c.evictOneDeniedLocked() { + return true + } + if outcome == catalogOutcomeOK { + return true + } + return false +} + +func (c *CatalogCache) evictOneDeniedLocked() bool { + // Walk deniedKeys from the back, dropping stale entries (already + // expired / no longer denied). The first valid one we hit gets + // random-evicted by swap-remove. + for len(c.deniedKeys) > 0 { + idx := c.rngIntn(len(c.deniedKeys)) + key := c.deniedKeys[idx] + // Swap-remove from deniedKeys regardless. + last := len(c.deniedKeys) - 1 + c.deniedKeys[idx] = c.deniedKeys[last] + c.deniedKeys = c.deniedKeys[:last] + + e, ok := c.entries[key] + if !ok || e.Outcome != catalogOutcomeDenied { + continue + } + delete(c.entries, key) + return true + } + return false +} + +func (c *CatalogCache) rngIntn(n int) int { + if n <= 1 { + return 0 + } + c.rngMu.Lock() + defer c.rngMu.Unlock() + return c.rng.Intn(n) +} diff --git a/pkg/server/catalog_cache_test.go b/pkg/server/catalog_cache_test.go new file mode 100644 index 00000000..fe3031a0 --- /dev/null +++ b/pkg/server/catalog_cache_test.go @@ -0,0 +1,219 @@ +package server + +import ( + "context" + "errors" + "fmt" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/altinity/altinity-mcp/pkg/clickhouse" + "github.com/altinity/altinity-mcp/pkg/config" + "github.com/stretchr/testify/require" +) + +// noClientFactory is the ClientFactory used by tests that don't actually +// need a CH client — DiscoverTools short-circuits when len(rules)==0 (no +// factory invocation), so we can use this in fast-path tests by pairing +// it with an empty rules slice. Tests that need to drive discovery +// produce a custom factory inline. +var noClientFactory ClientFactory = func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) { + return nil, errors.New("noClientFactory: should not be called when rules are empty") +} + +func TestCatalogCache_HitOK(t *testing.T) { + t.Parallel() + c := newTestCache(t, 100, 15*time.Minute, 60*time.Second) + defer c.Close() + + // First call: miss, runs (no-op) discovery, caches positive entry. + tools, err := c.GetOrDiscover(context.Background(), "k1", "alpha", config.ClickHouseConfig{}, noClientFactory, nil, true, time.Time{}) + require.NoError(t, err) + require.NotNil(t, tools) + require.Equal(t, uint64(1), c.Metrics.Misses.Load()) + + // Second call: hit. + _, err = c.GetOrDiscover(context.Background(), "k1", "alpha", config.ClickHouseConfig{}, noClientFactory, nil, true, time.Time{}) + require.NoError(t, err) + require.Equal(t, uint64(1), c.Metrics.HitsOK.Load()) +} + +func TestCatalogCache_BearerExpBoundsTTL(t *testing.T) { + t.Parallel() + c := newTestCache(t, 100, time.Hour, 60*time.Second) + defer c.Close() + + near := time.Now().Add(5 * time.Second) + _, err := c.GetOrDiscover(context.Background(), "k1", "alpha", config.ClickHouseConfig{}, noClientFactory, nil, true, near) + require.NoError(t, err) + + // Read the entry directly to inspect TTL. + c.mu.Lock() + e, ok := c.entries[fullKey("k1", "alpha")] + c.mu.Unlock() + require.True(t, ok) + require.WithinDuration(t, near, e.ExpiresAt, time.Second, "expiry capped by bearer exp") +} + +func TestCatalogCache_DenyCachedAndReturned(t *testing.T) { + t.Parallel() + c := newTestCache(t, 100, 15*time.Minute, 60*time.Second) + defer c.Close() + + rules := []config.DynamicToolRule{{Name: "x", Regexp: ".*", Type: "read"}} + authErr := errors.New("ClickHouse error: code: 516, message: AUTHENTICATION_FAILED") + factory := func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) { + return nil, authErr + } + + _, err := c.GetOrDiscover(context.Background(), "k1", "alpha", config.ClickHouseConfig{}, factory, rules, true, time.Time{}) + require.Error(t, err) + require.Equal(t, uint64(1), c.Metrics.DiscoveryAuthErr.Load()) + + // Second call: should hit denied cache without ever calling factory. + called := 0 + loud := func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) { + called++ + return nil, authErr + } + _, err = c.GetOrDiscover(context.Background(), "k1", "alpha", config.ClickHouseConfig{}, loud, rules, true, time.Time{}) + require.ErrorIs(t, err, ErrDiscoveryDenied) + require.Equal(t, 0, called, "denied entry must short-circuit the factory") + require.Equal(t, uint64(1), c.Metrics.HitsDenied.Load()) +} + +func TestCatalogCache_TransientErrorNotCached(t *testing.T) { + t.Parallel() + c := newTestCache(t, 100, 15*time.Minute, 60*time.Second) + defer c.Close() + + rules := []config.DynamicToolRule{{Name: "x", Regexp: ".*", Type: "read"}} + transient := errors.New("connection refused") + calls := atomic.Int32{} + factory := func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) { + calls.Add(1) + return nil, transient + } + + for i := 0; i < 3; i++ { + _, err := c.GetOrDiscover(context.Background(), "k1", "alpha", config.ClickHouseConfig{}, factory, rules, true, time.Time{}) + require.Error(t, err) + } + require.GreaterOrEqual(t, int(calls.Load()), 3, "transient errors must not be cached") + require.Equal(t, uint64(0), c.Metrics.DiscoveryAuthErr.Load()) +} + +func TestCatalogCache_PositivesNeverEvictedByThreshold(t *testing.T) { + t.Parallel() + c := newTestCache(t, 10, 15*time.Minute, 60*time.Second) + defer c.Close() + + // Insert 7 positives (70% fill). + for i := 0; i < 7; i++ { + _, err := c.GetOrDiscover(context.Background(), fmt.Sprintf("k%d", i), "alpha", config.ClickHouseConfig{}, noClientFactory, nil, true, time.Time{}) + require.NoError(t, err) + } + + // Insert several denied entries — they should occupy the remaining + // 3 slots, then start evicting each other. + rules := []config.DynamicToolRule{{Name: "x", Regexp: ".*", Type: "read"}} + auth := errors.New("Code: 497 ACCESS_DENIED") + authFactory := func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) { + return nil, auth + } + for i := 0; i < 20; i++ { + _, _ = c.GetOrDiscover(context.Background(), fmt.Sprintf("denied%d", i), "alpha", config.ClickHouseConfig{}, authFactory, rules, true, time.Time{}) + } + + // All 7 positives must still be present. + c.mu.Lock() + defer c.mu.Unlock() + for i := 0; i < 7; i++ { + e, ok := c.entries[fullKey(fmt.Sprintf("k%d", i), "alpha")] + require.True(t, ok, "positive k%d evicted unexpectedly", i) + require.Equal(t, catalogOutcomeOK, e.Outcome) + } +} + +func TestCatalogCache_SingleflightCollapsesConcurrentMisses(t *testing.T) { + t.Parallel() + c := newTestCache(t, 100, 15*time.Minute, 60*time.Second) + defer c.Close() + + gate := make(chan struct{}) + calls := atomic.Int32{} + factory := func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) { + calls.Add(1) + <-gate + return nil, errors.New("ok-but-empty") + } + // Empty rules + custom error so we don't depend on producing tools. + // Use real rule so factory is invoked. + rules := []config.DynamicToolRule{{Name: "x", Regexp: ".*", Type: "read"}} + + const N = 8 + var wg sync.WaitGroup + wg.Add(N) + for i := 0; i < N; i++ { + go func() { + defer wg.Done() + _, _ = c.GetOrDiscover(context.Background(), "shared", "alpha", config.ClickHouseConfig{}, factory, rules, true, time.Time{}) + }() + } + // Give the goroutines time to all enter singleflight. + time.Sleep(50 * time.Millisecond) + close(gate) + wg.Wait() + require.Equal(t, int32(1), calls.Load(), "singleflight must collapse N concurrent misses to 1 discovery") +} + +func TestCatalogCache_DiscoverySaturation(t *testing.T) { + t.Parallel() + c := newTestCache(t, 100, 15*time.Minute, 60*time.Second) + defer c.Close() + + hold := make(chan struct{}) + rules := []config.DynamicToolRule{{Name: "x", Regexp: ".*", Type: "read"}} + factory := func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) { + <-hold + return nil, errors.New("never") + } + + // Fill all maxConcurrentDiscoveries slots with distinct keys. + var wg sync.WaitGroup + for i := 0; i < maxConcurrentDiscoveries; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + _, _ = c.GetOrDiscover(context.Background(), fmt.Sprintf("hot%d", i), "alpha", config.ClickHouseConfig{}, factory, rules, true, time.Time{}) + }(i) + } + // Wait for all to acquire the semaphore. + require.Eventually(t, func() bool { + return len(c.sem) == maxConcurrentDiscoveries + }, time.Second, 10*time.Millisecond) + + // Next call with a short ctx must fail-fast on the semaphore wait. + ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond) + defer cancel() + _, err := c.GetOrDiscover(ctx, "extra", "alpha", config.ClickHouseConfig{}, factory, rules, true, time.Time{}) + require.ErrorIs(t, err, ErrDiscoverySaturated) + require.GreaterOrEqual(t, c.Metrics.DiscoverySaturate.Load(), uint64(1)) + + // Release the held goroutines and wait for them to finish so the + // test's resources (and the semaphore) are clean for the next test. + close(hold) + wg.Wait() +} + +func newTestCache(t *testing.T, max int, fallback, neg time.Duration) *CatalogCache { + t.Helper() + return NewCatalogCache(config.MulticlusterConfig{ + Enabled: true, + CatalogCacheMax: max, + CatalogTTLFallback: fallback, + CatalogNegativeTTL: neg, + }) +} diff --git a/pkg/server/multicluster_factory.go b/pkg/server/multicluster_factory.go new file mode 100644 index 00000000..3880a4be --- /dev/null +++ b/pkg/server/multicluster_factory.go @@ -0,0 +1,126 @@ +package server + +import ( + "context" + "net/http" + + "github.com/altinity/altinity-mcp/pkg/clickhouse" + "github.com/altinity/altinity-mcp/pkg/config" + "github.com/modelcontextprotocol/go-sdk/mcp" + "github.com/rs/zerolog/log" +) + +// MulticlusterServerFactory mints a per-(bearer, cluster) *mcp.Server on +// each incoming request. Single-cluster mode uses one *mcp.Server for the +// process; multi-cluster mode cannot, because per-tenant dynamic tools +// would poison cross-tenant tools/list. +// +// The factory holds the long-lived (cfg, cache, ClickHouseJWEServer) +// triple and exposes a single GetServer(r *http.Request) entry point +// shaped for mcp.NewStreamableHTTPHandler's first argument. +type MulticlusterServerFactory struct { + cfg config.Config + parent *ClickHouseJWEServer + cache *CatalogCache + clientFn ClientFactory + dynamicRules []config.DynamicToolRule + version string + instrName string + instrTitle string +} + +// NewMulticlusterServerFactory wires up the factory. parent supplies +// per-request OAuth/JWE extraction helpers and the CH client builder; +// cache is the catalog cache; cfg.Multicluster must be Enabled. +func NewMulticlusterServerFactory( + cfg config.Config, + parent *ClickHouseJWEServer, + cache *CatalogCache, + version string, +) *MulticlusterServerFactory { + f := &MulticlusterServerFactory{ + cfg: cfg, + parent: parent, + cache: cache, + // Dynamic-tool rules must come off the parent server, not cfg. + // RegisterTools (run inside NewClickHouseMCPServer) converts the + // unified `tools:` config into Config.Server.DynamicTools on the + // parent's own config copy; the cfg value handed to this factory is + // a separate copy that never saw that conversion. Reading + // cfg.Server.DynamicTools here would silently miss every dynamic + // rule declared via the unified `tools:` block (only the deprecated + // dynamic_tools: key, parsed straight from YAML, would survive). + dynamicRules: parent.Config.Server.DynamicTools, + version: version, + instrName: "Altinity ClickHouse MCP Server", + instrTitle: "Altinity ClickHouse MCP Server (multi-cluster)", + } + f.clientFn = func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error) { + jweToken := parent.ExtractTokenFromCtx(ctx) + oauthToken := parent.ExtractOAuthTokenFromCtx(ctx) + oauthClaims := parent.GetOAuthClaimsFromCtx(ctx) + return parent.GetClickHouseClientWithOAuthForConfig(ctx, chCfg, jweToken, oauthToken, oauthClaims) + } + return f +} + +// GetServer is the callback handed to mcp.NewStreamableHTTPHandler. It +// runs after the multi-cluster router middleware (which has injected +// cluster name + per-request chCfg on ctx) and after the OAuth injector +// (which has placed the bearer on ctx). Looks up the catalog cache and +// mints a fresh *mcp.Server populated with the right tool set. +func (f *MulticlusterServerFactory) GetServer(r *http.Request) *mcp.Server { + ctx := r.Context() + cluster, hasCluster := ClusterFromContext(ctx) + if !hasCluster { + // Defensive — the router should have 404'd. Return a static-only + // server so the SDK can still respond with method-not-found rather + // than nil-panic. + return f.newServer(nil) + } + bearer := f.parent.ExtractOAuthTokenFromCtx(ctx) + if bearer == "" { + return f.newServer(nil) + } + reqCfg := CHConfigFromContext(ctx, f.cfg.ClickHouse) + key := CacheKey(bearer) + exp, _ := BearerExp(bearer) + + tools, err := f.cache.GetOrDiscover(ctx, key, cluster, reqCfg, f.clientFn, + f.dynamicRules, f.cfg.ClickHouse.ReadOnly, exp) + if err != nil { + log.Warn().Err(err).Str("cluster", cluster).Msg("multicluster: discovery failed; static-only") + return f.newServer(nil) + } + return f.newServer(tools) +} + +// newServer builds a fresh *mcp.Server with resources, prompts, static +// tools, and (optionally) the per-tenant discovered dynamic tools. +func (f *MulticlusterServerFactory) newServer(dynamicTools map[string]dynamicToolMeta) *mcp.Server { + opts := &mcp.ServerOptions{ + Instructions: f.instrTitle + " - A Model Context Protocol server for interacting with ClickHouse databases", + HasTools: true, + HasResources: true, + HasPrompts: true, + } + srv := mcp.NewServer(&mcp.Implementation{ + Name: f.instrName, + Version: f.version, + }, opts) + adapter := NewSDKServerAdapter(srv) + RegisterResources(adapter) + RegisterPrompts(adapter) + RegisterStaticToolsOn(adapter, &f.cfg) + if len(dynamicTools) > 0 { + registerDynamicToolsOn(adapter, dynamicTools, f.cfg.Server.ToolInputSettings, nil) + } + return srv +} + +// ValidateClusterAllowed reports whether the cluster name passes the +// validation + allowlist gates. Exported so cmd/altinity-mcp can reject +// /.well-known/.../mcp/{bogus} before delegating to the existing PRM handler. +func (r *MulticlusterRouter) ValidateClusterAllowed(cluster string) (config.ClickHouseConfig, bool) { + return r.resolveCluster(cluster) +} diff --git a/pkg/server/multicluster_factory_test.go b/pkg/server/multicluster_factory_test.go new file mode 100644 index 00000000..46d6fa52 --- /dev/null +++ b/pkg/server/multicluster_factory_test.go @@ -0,0 +1,45 @@ +package server + +import ( + "testing" + "time" + + "github.com/altinity/altinity-mcp/pkg/config" + "github.com/stretchr/testify/require" +) + +// TestMulticlusterFactory_PicksUpUnifiedDynamicRules guards a regression: +// dynamic-tool rules declared via the unified `tools:` block are converted +// into Config.Server.DynamicTools by RegisterTools — but on the *parent +// server's* config copy, not on the value handed to the factory. The +// factory must therefore source its discovery rules from the parent, or +// every unified dynamic rule is silently dropped in multi-cluster mode. +func TestMulticlusterFactory_PicksUpUnifiedDynamicRules(t *testing.T) { + t.Parallel() + cfg := config.Config{ + ClickHouse: config.ClickHouseConfig{Host: "chi-{cluster}.demo"}, + Server: config.ServerConfig{ + Tools: []config.ToolDefinition{ + {Type: "read", ViewRegexp: "^analytics\\..*"}, + }, + }, + } + parent := NewClickHouseMCPServer(cfg, "test") + + // Sanity: the unified rule did NOT land on the value-copy cfg... + require.Empty(t, cfg.Server.DynamicTools, "conversion must not mutate caller's cfg copy") + // ...but DID land on the parent server's config copy. + require.Len(t, parent.Config.Server.DynamicTools, 1, "RegisterTools converts unified tools onto the parent") + + cache := NewCatalogCache(config.MulticlusterConfig{ + Enabled: true, + CatalogCacheMax: 100, + CatalogTTLFallback: 15 * time.Minute, + CatalogNegativeTTL: time.Minute, + }) + defer cache.Close() + + f := NewMulticlusterServerFactory(cfg, parent, cache, "test") + require.Len(t, f.dynamicRules, 1, "factory must inherit the resolved dynamic rules from the parent, not the bare cfg") + require.Equal(t, "^analytics\\..*", f.dynamicRules[0].Regexp) +} diff --git a/pkg/server/multicluster_identity.go b/pkg/server/multicluster_identity.go new file mode 100644 index 00000000..f7afdccd --- /dev/null +++ b/pkg/server/multicluster_identity.go @@ -0,0 +1,156 @@ +package server + +import ( + "context" + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "encoding/json" + "errors" + "net" + "strings" + "time" +) + +// CacheKey derives the catalog cache key from a raw bearer token. Returns +// "tok:" + sha256_hex(bearer). The "tok:" prefix is purely for log +// readability — the security property is that the cache key is bound to +// the literal bearer bytes, so token rotation invalidates the cache and a +// forged claim set cannot reuse a victim's cached catalog. The cluster +// name is appended separately by the catalog cache to give a final key of +// shape "\x00". +func CacheKey(bearer string) string { + sum := sha256.Sum256([]byte(bearer)) + return "tok:" + hex.EncodeToString(sum[:]) +} + +// BearerExp extracts the `exp` (RFC 7519 §4.1.4) claim from an unverified +// JWT bearer. Returns (zero, false) for opaque bearers, malformed JWTs, or +// JWTs without an exp claim. The signature is NOT validated — this is +// purely a TTL hint used to bound a positive catalog cache entry's +// expiration; the actual signature/iss/aud/exp validation is done by the +// sidecar at query time. +func BearerExp(bearer string) (time.Time, bool) { + parts := strings.Split(strings.TrimSpace(bearer), ".") + if len(parts) != 3 { + return time.Time{}, false + } + payload, err := base64.RawURLEncoding.DecodeString(parts[1]) + if err != nil { + payload, err = base64.URLEncoding.DecodeString(parts[1]) + if err != nil { + return time.Time{}, false + } + } + var claims struct { + Exp json.Number `json:"exp"` + } + if err := json.Unmarshal(payload, &claims); err != nil { + return time.Time{}, false + } + if claims.Exp == "" { + return time.Time{}, false + } + expInt, err := claims.Exp.Int64() + if err != nil { + // Spec says NumericDate but tolerate fractional seconds. + if expFloat, ferr := claims.Exp.Float64(); ferr == nil && expFloat > 0 { + return time.Unix(int64(expFloat), 0), true + } + return time.Time{}, false + } + if expInt <= 0 { + return time.Time{}, false + } + return time.Unix(expInt, 0), true +} + +// DiscoveryErrorClass categorises tool-discovery errors for the catalog +// cache. Only "auth"-class errors are memoised in the negative cache; +// transient errors (timeouts, 5xx, DNS, connection refused) are deliberately +// not cached so the next request reconnects. +type DiscoveryErrorClass string + +const ( + // DiscoveryAuthError indicates the user is not allowed to query this + // cluster (HTTP 401/403 from CH, or CH-side auth-class codes 516/519/497). + // These are memoised as a "denied" catalog entry for catalog_negative_ttl. + DiscoveryAuthError DiscoveryErrorClass = "auth" + // DiscoveryTransientError covers timeouts, DNS, 5xx, refused — anything + // where retrying might succeed. Not cached. + DiscoveryTransientError DiscoveryErrorClass = "transient" +) + +// ClassifyDiscoveryError reports whether err is an auth-class failure +// (worth memoising as a denied entry in the catalog cache) or a transient +// one (not cached). The set of "auth-class" CH exception codes is +// deliberately small: 516 (AUTHENTICATION_FAILED), 519 (NOT_ENOUGH_PRIVILEGES), +// 497 (ACCESS_DENIED). Everything else falls through to transient — the +// catalog cache will not stamp out a recoverable infrastructure blip. +func ClassifyDiscoveryError(err error) (auth bool, class DiscoveryErrorClass) { + if err == nil { + return false, "" + } + msg := err.Error() + + // net/http 401/403 surfaces and embedded test rigs. Match on common + // substrings rather than typed errors — the ClickHouse driver returns + // wrapped errors whose underlying type varies across protocol+transport + // combinations. + if strings.Contains(msg, "401") || strings.Contains(msg, "Unauthorized") || + strings.Contains(msg, "403") || strings.Contains(msg, "Forbidden") { + return true, DiscoveryAuthError + } + + if strings.Contains(msg, "Token authentication is not configured") { + return true, DiscoveryAuthError + } + + // CH exception codes — encoded into the error text by the driver as + // "code: NNN". Auth-class set: 497 (ACCESS_DENIED), 516 + // (AUTHENTICATION_FAILED), 519 (NOT_ENOUGH_PRIVILEGES). + if containsCHCode(msg, 497) || containsCHCode(msg, 516) || containsCHCode(msg, 519) { + return true, DiscoveryAuthError + } + + // Network / timeout / DNS — explicitly transient. + if errors.Is(err, context.DeadlineExceeded) { + return false, DiscoveryTransientError + } + var netErr net.Error + if errors.As(err, &netErr) { + return false, DiscoveryTransientError + } + + // Default: treat unknown as transient. Better to refetch than to + // stamp out a flaky backend with a cached "denied". + return false, DiscoveryTransientError +} + +// containsCHCode reports whether msg contains a ClickHouse exception code +// of the shape "code: NNN" or "Code: NNN" — emitted by both the native +// and HTTP drivers in the error text. +func containsCHCode(msg string, code int) bool { + needle := "code: " + idx := strings.Index(strings.ToLower(msg), needle) + if idx < 0 { + return false + } + tail := msg[idx+len(needle):] + if len(tail) < 3 { + return false + } + // Compare the leading digits to code. + digits := 0 + for digits < len(tail) && tail[digits] >= '0' && tail[digits] <= '9' { + digits++ + } + if digits == 0 { + return false + } + parsed := 0 + for i := 0; i < digits; i++ { + parsed = parsed*10 + int(tail[i]-'0') + } + return parsed == code +} diff --git a/pkg/server/multicluster_identity_test.go b/pkg/server/multicluster_identity_test.go new file mode 100644 index 00000000..1b854e2c --- /dev/null +++ b/pkg/server/multicluster_identity_test.go @@ -0,0 +1,110 @@ +package server + +import ( + "context" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "testing" + "time" + + "github.com/stretchr/testify/require" +) + +func TestCacheKey(t *testing.T) { + t.Parallel() + k1 := CacheKey("aaa") + k2 := CacheKey("aaa") + k3 := CacheKey("aab") + require.Equal(t, k1, k2, "stable for same bearer") + require.NotEqual(t, k1, k3, "diverges on token rotation") + require.True(t, len(k1) > len("tok:"), "non-empty hex body") +} + +func TestCacheKey_PreventsForgedClaimReuse(t *testing.T) { + t.Parallel() + // Two different bearer strings — even if their JWT payloads have an + // identical email claim — must hash to different cache keys. This is + // the v1.0 forgery defence: cache key is bound to the literal bearer + // bytes, not to a parsed sub/email claim that an attacker could + // match by emitting their own JWT with the same email. + a := makeFakeJWT(t, map[string]any{"email": "alice@example.com", "sub": "user1"}) + b := makeFakeJWT(t, map[string]any{"email": "alice@example.com", "sub": "user2"}) + require.NotEqual(t, CacheKey(a), CacheKey(b)) +} + +func TestBearerExp_JWT(t *testing.T) { + t.Parallel() + want := time.Now().Add(45 * time.Minute).Unix() + tok := makeFakeJWT(t, map[string]any{"exp": want}) + got, ok := BearerExp(tok) + require.True(t, ok) + require.Equal(t, want, got.Unix()) +} + +func TestBearerExp_FractionalExp(t *testing.T) { + t.Parallel() + want := float64(time.Now().Add(time.Minute).Unix()) + 0.5 + tok := makeFakeJWT(t, map[string]any{"exp": want}) + got, ok := BearerExp(tok) + require.True(t, ok) + require.Equal(t, int64(want), got.Unix()) +} + +func TestBearerExp_OpaqueAndMalformed(t *testing.T) { + t.Parallel() + _, ok := BearerExp("opaque-token") + require.False(t, ok) + _, ok = BearerExp("") + require.False(t, ok) + // 3 parts but bad base64 + _, ok = BearerExp("not.a.jwt") + require.False(t, ok) +} + +func TestBearerExp_NoExpClaim(t *testing.T) { + t.Parallel() + tok := makeFakeJWT(t, map[string]any{"email": "x@y.z"}) + _, ok := BearerExp(tok) + require.False(t, ok) +} + +func TestClassifyDiscoveryError(t *testing.T) { + t.Parallel() + cases := []struct { + name string + err error + wantAuth bool + }{ + {"nil", nil, false}, + {"401", errors.New("http 401 Unauthorized"), true}, + {"403", errors.New("http 403 Forbidden"), true}, + {"code-516", errors.New("clickhouse: code: 516, message: ..."), true}, + {"code-519", errors.New("Code: 519 NOT_ENOUGH_PRIVILEGES"), true}, + {"code-497", errors.New("Code: 497 ACCESS_DENIED"), true}, + {"deadline", context.DeadlineExceeded, false}, + {"random-500", errors.New("http 500 oops"), false}, + {"connection-refused", errors.New("connection refused"), false}, + } + for _, c := range cases { + c := c + t.Run(c.name, func(t *testing.T) { + t.Parallel() + gotAuth, _ := ClassifyDiscoveryError(c.err) + require.Equal(t, c.wantAuth, gotAuth, "err=%v", c.err) + }) + } +} + +// makeFakeJWT crafts a base64-URL-encoded fake JWT with the given claims +// (signature ignored — these helpers never validate). +func makeFakeJWT(t *testing.T, claims map[string]any) string { + t.Helper() + header := base64.RawURLEncoding.EncodeToString([]byte(`{"alg":"HS256"}`)) + payloadJSON, err := json.Marshal(claims) + require.NoError(t, err) + payload := base64.RawURLEncoding.EncodeToString(payloadJSON) + sig := base64.RawURLEncoding.EncodeToString([]byte("nope")) + return fmt.Sprintf("%s.%s.%s", header, payload, sig) +} diff --git a/pkg/server/multicluster_router.go b/pkg/server/multicluster_router.go new file mode 100644 index 00000000..58b55db8 --- /dev/null +++ b/pkg/server/multicluster_router.go @@ -0,0 +1,89 @@ +package server + +import ( + "fmt" + "net/http" + "regexp" + "strings" + + "github.com/altinity/altinity-mcp/pkg/config" + "github.com/rs/zerolog/log" +) + +// clusterNameRegex matches RFC 1123 DNS labels (lowercase a-z, 0-9, and +// '-', not starting or ending with '-', total length 1..63). Excludes +// leading dots — the load-bearing property is that no valid cluster name +// can collide with /.well-known/* prefixes under any future mount that +// might overlap. +var clusterNameRegex = regexp.MustCompile(`^[a-z0-9]([-a-z0-9]{0,61}[a-z0-9])?$`) + +// IsValidClusterName reports whether name passes the RFC 1123 DNS label +// check. Exported for tests and validation paths in cmd/altinity-mcp. +func IsValidClusterName(name string) bool { + return clusterNameRegex.MatchString(name) +} + +// MulticlusterRouter validates the {cluster} URL path value and expands +// the ClickHouseConfig.Host template into a per-request config that +// downstream middleware reads via CHConfigFromContext. +type MulticlusterRouter struct { + cfg config.MulticlusterConfig + ch config.ClickHouseConfig + allowlistSet map[string]struct{} +} + +// NewMulticlusterRouter builds a router. Returns an error if any +// allowlist entry fails the cluster-name regex. +func NewMulticlusterRouter(mc config.MulticlusterConfig, ch config.ClickHouseConfig) (*MulticlusterRouter, error) { + allowlist := make(map[string]struct{}, len(mc.ClusterAllowlist)) + for _, name := range mc.ClusterAllowlist { + trimmed := strings.TrimSpace(name) + if trimmed == "" { + continue + } + if !IsValidClusterName(trimmed) { + return nil, fmt.Errorf("multicluster: cluster_allowlist entry %q is not a valid RFC 1123 DNS label", trimmed) + } + allowlist[trimmed] = struct{}{} + } + return &MulticlusterRouter{ + cfg: mc, + ch: ch, + allowlistSet: allowlist, + }, nil +} + +// resolveCluster validates the path-extracted cluster name and returns the +// per-request ClickHouseConfig. Returns ok=false on any rejection. +func (r *MulticlusterRouter) resolveCluster(cluster string) (config.ClickHouseConfig, bool) { + if !IsValidClusterName(cluster) { + return config.ClickHouseConfig{}, false + } + if len(r.allowlistSet) > 0 { + if _, allowed := r.allowlistSet[cluster]; !allowed { + return config.ClickHouseConfig{}, false + } + } + reqCfg := r.ch // copy + reqCfg.Host = strings.ReplaceAll(r.ch.Host, "{cluster}", cluster) + return reqCfg, true +} + +// Middleware extracts {cluster} from r.PathValue, validates it, expands +// the host template, and injects both the cluster name and the per-request +// ClickHouseConfig on the context. Invalid or non-allowlisted clusters get +// a 404 response without leaking which check failed. +func (r *MulticlusterRouter) Middleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + cluster := req.PathValue("cluster") + reqCfg, ok := r.resolveCluster(cluster) + if !ok { + log.Debug().Str("cluster", cluster).Msg("multicluster: cluster rejected (invalid name or not in allowlist)") + http.NotFound(w, req) + return + } + ctx := WithCluster(req.Context(), cluster) + ctx = WithRequestCHConfig(ctx, reqCfg) + next.ServeHTTP(w, req.WithContext(ctx)) + }) +} diff --git a/pkg/server/multicluster_router_test.go b/pkg/server/multicluster_router_test.go new file mode 100644 index 00000000..ceb6e551 --- /dev/null +++ b/pkg/server/multicluster_router_test.go @@ -0,0 +1,136 @@ +package server + +import ( + "net/http" + "net/http/httptest" + "testing" + + "github.com/altinity/altinity-mcp/pkg/config" + "github.com/stretchr/testify/require" +) + +func TestIsValidClusterName(t *testing.T) { + t.Parallel() + cases := []struct { + name string + in string + want bool + }{ + {"basic", "otel", true}, + {"with-dash", "my-cluster", true}, + {"digits", "cluster1", true}, + {"single-char", "a", true}, + {"empty", "", false}, + {"uppercase", "Otel", false}, + {"leading-dot", ".well-known", false}, + {"with-dot", "evil.example", false}, + {"trailing-dash", "cluster-", false}, + {"leading-dash", "-cluster", false}, + {"ipv4-literal", "10.0.0.1", false}, + {"slash", "a/b", false}, + {"too-long", "a-very-long-name-that-exceeds-the-rfc-1123-dns-label-limit-of-63-characters-and-then-some", false}, + {"underscore", "my_cluster", false}, + {"space", "my cluster", false}, + } + for _, c := range cases { + c := c + t.Run(c.name, func(t *testing.T) { + t.Parallel() + require.Equal(t, c.want, IsValidClusterName(c.in), "case %q (%q)", c.name, c.in) + }) + } +} + +func TestMulticlusterRouter_AllowlistRejection(t *testing.T) { + t.Parallel() + mc := config.MulticlusterConfig{ + Enabled: true, + ClusterAllowlist: []string{"otel", "antalya"}, + } + ch := config.ClickHouseConfig{Host: "chi-{cluster}-{cluster}-0-0.demo", Port: 8443} + router, err := NewMulticlusterRouter(mc, ch) + require.NoError(t, err) + + tests := []struct { + cluster string + want int + }{ + {"otel", 200}, + {"antalya", 200}, + {"bogus", 404}, + {"evil.example", 404}, + {"", 404}, + {".well-known", 404}, + } + + next := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + cluster, ok := ClusterFromContext(r.Context()) + require.True(t, ok, "expected cluster on ctx") + require.NotEmpty(t, cluster) + cfg := CHConfigFromContext(r.Context(), config.ClickHouseConfig{}) + require.Contains(t, cfg.Host, cluster) + require.NotContains(t, cfg.Host, "{cluster}") + w.WriteHeader(200) + }) + handler := router.Middleware(next) + + for _, tc := range tests { + tc := tc + t.Run(tc.cluster, func(t *testing.T) { + rr := httptest.NewRecorder() + // Use mux pattern so r.PathValue works. + mux := http.NewServeMux() + mux.Handle("/mcp/{cluster}", handler) + req := httptest.NewRequest("GET", "/mcp/"+tc.cluster, nil) + mux.ServeHTTP(rr, req) + require.Equal(t, tc.want, rr.Code, "cluster=%q", tc.cluster) + }) + } +} + +func TestMulticlusterRouter_HostExpansion(t *testing.T) { + t.Parallel() + mc := config.MulticlusterConfig{ + Enabled: true, + ClusterAllowlist: []string{"alpha"}, + } + ch := config.ClickHouseConfig{Host: "chi-{cluster}-{cluster}-0-0.demo", Port: 8443} + router, err := NewMulticlusterRouter(mc, ch) + require.NoError(t, err) + + cfg, ok := router.ValidateClusterAllowed("alpha") + require.True(t, ok) + require.Equal(t, "chi-alpha-alpha-0-0.demo", cfg.Host) + // Other fields preserved. + require.Equal(t, 8443, cfg.Port) +} + +func TestMulticlusterRouter_EmptyAllowlistAllowsAny(t *testing.T) { + t.Parallel() + // When allowlist is empty, *any* valid RFC 1123 name is allowed. + // This is deliberate: operators sometimes want to admit all clusters + // in a namespace; the strict check is still the DNS-label regex. + mc := config.MulticlusterConfig{Enabled: true} + ch := config.ClickHouseConfig{Host: "chi-{cluster}.demo"} + router, err := NewMulticlusterRouter(mc, ch) + require.NoError(t, err) + + cfg, ok := router.ValidateClusterAllowed("anything") + require.True(t, ok) + require.Equal(t, "chi-anything.demo", cfg.Host) + + _, ok = router.ValidateClusterAllowed("evil.example") + require.False(t, ok) +} + +func TestNewMulticlusterRouter_RejectsBadAllowlist(t *testing.T) { + t.Parallel() + mc := config.MulticlusterConfig{ + Enabled: true, + ClusterAllowlist: []string{"good", "Bad-Caps"}, + } + ch := config.ClickHouseConfig{Host: "chi-{cluster}.demo"} + _, err := NewMulticlusterRouter(mc, ch) + require.Error(t, err) + require.Contains(t, err.Error(), "Bad-Caps") +} diff --git a/pkg/server/oauth_e2e_test.go b/pkg/server/oauth_e2e_test.go index 0503b650..db214057 100644 --- a/pkg/server/oauth_e2e_test.go +++ b/pkg/server/oauth_e2e_test.go @@ -28,7 +28,7 @@ import ( // 1. A lightweight mock OIDC provider (in-process Go HTTP server bound to 127.0.0.1) // 2. Altinity Antalya ClickHouse with token_processors for JWT auth, run as a host // subprocess via embedded-clickhouse + an extracted Antalya binary -// 3. MCP server forwarding Bearer tokens to ClickHouse +// 3. MCP server presenting Bearer tokens to ClickHouse // // On Linux the Antalya binary is auto-extracted from the published Docker // image on first run. On macOS/other hosts there is no published binary, @@ -76,7 +76,6 @@ func TestOAuthE2EWithMockOIDC(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", }, }, }, "test-e2e") @@ -164,7 +163,6 @@ func TestOAuthE2EWithMockOIDC(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", }, }, }, "test-e2e") @@ -316,7 +314,7 @@ func setupEmbeddedAntalyaWithOIDC(t *testing.T, oidcDiscoveryURL string) config. // token_processors parser requires (issuer + auth/token/jwks/userinfo/ // introspection endpoints + response_types + subject_types + id_token_signing // algs). The shorter doc returned by newTestOAuthProvider is enough for -// forward-mode bearer-passthrough tests that never look at it — Antalya +// OAuth bearer-passthrough tests that never look at it — Antalya // rejects it with "Cannot extract userinfo_endpoint or introspection_endpoint". // // Uses an explicit net.Listen + http.Server.Serve loop instead of diff --git a/pkg/server/server.go b/pkg/server/server.go index 981b7b7f..00a065a7 100644 --- a/pkg/server/server.go +++ b/pkg/server/server.go @@ -29,6 +29,10 @@ type ClickHouseJWEServer struct { // from the verifier() getter. oauthVerifier *oauth.Verifier blockedClauses map[string]bool + // chOAuthMethodCache stores the discovered CH auth method (Bearer vs Basic) + // keyed by "host:port". Populated on the first OAuth request to an endpoint; + // cleared on config reload so operator changes take effect without restart. + chOAuthMethodCache sync.Map } // ToolHandlerFunc is a function type for tool handlers @@ -148,6 +152,33 @@ func truncateErrForClient(err error) string { return msg } +// RegisterStaticToolsOn registers the static tools described by cfg's +// resolved Tools list on srv, without touching cfg.Server.DynamicTools or +// the parent ClickHouseJWEServer's state. Used by multi-cluster mode where +// each (bearer, cluster) request mints a fresh *mcp.Server. +// +// Returns the number of static tools actually registered. +func RegisterStaticToolsOn(srv AltinityMCPServer, cfg *config.Config) int { + toolsToRegister := resolveToolDefinitions(cfg) + staticToolCount := 0 + for _, td := range toolsToRegister { + if td.Type != "read" && td.Type != "write" { + continue + } + if td.ViewRegexp != "" || td.TableRegexp != "" { + // Dynamic rule — handled by RegisterDynamicToolsOn after discovery. + continue + } + if td.Name == "" { + continue + } + if registerStaticTool(srv, td, &cfg.Server, cfg.ClickHouse.ReadOnly) { + staticToolCount++ + } + } + return staticToolCount +} + // RegisterTools adds ClickHouse tools to the MCP server. It accepts either // the new unified Tools configuration or the legacy DynamicTools form // (deprecated; converted automatically with a warning). With no config, @@ -547,4 +578,91 @@ const ( JWETokenKey contextKey = "jwe_token" JWEClaimsKey contextKey = "jwe_claims" CHJWEServerKey contextKey = "clickhouse_jwe_server" + // clusterNameKey carries the (validated) {cluster} URL path value in + // multi-cluster mode. Unexported; access via ClusterFromContext. + clusterNameKey contextKey = "altinity_mcp_cluster" + // requestCHConfigKey carries the per-request ClickHouseConfig built by + // the multi-cluster router (host template expanded with {cluster}). + // Unexported; access via CHConfigFromContext. + requestCHConfigKey contextKey = "altinity_mcp_request_ch_config" ) + +// ClusterFromContext returns the cluster name injected by the +// multi-cluster router, if any. In single-cluster mode the router is not +// mounted and this returns ("", false). +func ClusterFromContext(ctx context.Context) (string, bool) { + v := ctx.Value(clusterNameKey) + if v == nil { + return "", false + } + name, ok := v.(string) + if !ok || name == "" { + return "", false + } + return name, true +} + +// CHConfigFromContext returns the per-request ClickHouse config (host +// templated for the active cluster) when the multi-cluster router has run, +// or s.Config.ClickHouse otherwise. Callers should always prefer this over +// reaching for s.Config.ClickHouse directly on a per-request hot path. +func CHConfigFromContext(ctx context.Context, fallback config.ClickHouseConfig) config.ClickHouseConfig { + if v := ctx.Value(requestCHConfigKey); v != nil { + if cfg, ok := v.(config.ClickHouseConfig); ok { + return cfg + } + } + return fallback +} + +// WithRequestCHConfig stores a per-request ClickHouseConfig on ctx. Used by +// the multi-cluster router; exported for tests. +func WithRequestCHConfig(ctx context.Context, chCfg config.ClickHouseConfig) context.Context { + return context.WithValue(ctx, requestCHConfigKey, chCfg) +} + +// WithCluster stores the cluster name on ctx. Used by the multi-cluster +// router; exported for tests. +func WithCluster(ctx context.Context, cluster string) context.Context { + return context.WithValue(ctx, clusterNameKey, cluster) +} + +// sdkServerAdapter wraps a raw *mcp.Server so multi-cluster code can reuse +// the AltinityMCPServer-shaped register helpers (RegisterResources, +// RegisterPrompts, registerStaticTool, registerDynamicToolsOn). In +// single-cluster mode RegisterTools and friends already accept the +// ClickHouseJWEServer directly; in multi-cluster mode each (bearer, +// cluster) request builds a fresh *mcp.Server with its own discovered +// tools, and we use this adapter on top of it. +type sdkServerAdapter struct { + srv *mcp.Server +} + +// NewSDKServerAdapter wraps srv into something callable as AltinityMCPServer. +func NewSDKServerAdapter(srv *mcp.Server) AltinityMCPServer { + return &sdkServerAdapter{srv: srv} +} + +func (a *sdkServerAdapter) AddTool(tool *mcp.Tool, handler ToolHandlerFunc) { + a.srv.AddTool(tool, func(ctx context.Context, req *mcp.CallToolRequest) (*mcp.CallToolResult, error) { + return handler(ctx, req) + }) +} + +func (a *sdkServerAdapter) AddResource(resource *mcp.Resource, handler ResourceHandlerFunc) { + a.srv.AddResource(resource, func(ctx context.Context, req *mcp.ReadResourceRequest) (*mcp.ReadResourceResult, error) { + return handler(ctx, req) + }) +} + +func (a *sdkServerAdapter) AddResourceTemplate(template *mcp.ResourceTemplate, handler ResourceHandlerFunc) { + a.srv.AddResourceTemplate(template, func(ctx context.Context, req *mcp.ReadResourceRequest) (*mcp.ReadResourceResult, error) { + return handler(ctx, req) + }) +} + +func (a *sdkServerAdapter) AddPrompt(prompt *mcp.Prompt, handler PromptHandlerFunc) { + a.srv.AddPrompt(prompt, func(ctx context.Context, req *mcp.GetPromptRequest) (*mcp.GetPromptResult, error) { + return handler(ctx, req) + }) +} diff --git a/pkg/server/server_auth_oauth_test.go b/pkg/server/server_auth_oauth_test.go index b92990fb..8a8b7b86 100644 --- a/pkg/server/server_auth_oauth_test.go +++ b/pkg/server/server_auth_oauth_test.go @@ -21,9 +21,9 @@ import ( "github.com/stretchr/testify/require" ) -func mintSelfIssuedToken(t *testing.T, gatingSecret string, claims map[string]interface{}) string { +func mintSelfIssuedToken(t *testing.T, signingSecret string, claims map[string]interface{}) string { t.Helper() - hashedSecret := jwe_auth.HashSHA256([]byte(gatingSecret)) + hashedSecret := jwe_auth.HashSHA256([]byte(signingSecret)) signer, err := jose.NewSigner( jose.SigningKey{Algorithm: jose.HS256, Key: hashedSecret}, (&jose.SignerOptions{}).WithType("JWT"), @@ -287,7 +287,6 @@ func TestOAuthValidateToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -327,7 +326,6 @@ func TestOAuthValidateToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -354,7 +352,6 @@ func TestOAuthValidateToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -375,10 +372,10 @@ func TestOAuthValidateToken(t *testing.T) { require.ErrorIs(t, err, ErrOAuthInsufficientScopes) }) - // C-1: forward-mode JWT with no JWKS source configured soft-passes — the + // C-1: JWT with no JWKS source configured soft-passes — the // MCP server cannot validate locally and defers to ClickHouse. Operators // who want strict local validation set Issuer or JWKSURL. - t.Run("forward_mode_jwt_without_jwks_source_softpasses", func(t *testing.T) { + t.Run("jwt_without_jwks_source_softpasses", func(t *testing.T) { t.Parallel() provider := newTestOAuthProvider(t, nil) srv := &ClickHouseJWEServer{ @@ -386,7 +383,6 @@ func TestOAuthValidateToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", }, }, }, @@ -402,10 +398,10 @@ func TestOAuthValidateToken(t *testing.T) { require.Nil(t, claims, "soft-pass returns nil claims to signal 'unvalidated, defer to ClickHouse'") }) - // C-1: opaque (non-JWT) bearers in forward mode soft-pass even when JWKS + // C-1: opaque (non-JWT) bearers soft-pass even when JWKS // IS configured — there is no way to validate them without RFC 7662 // introspection, which we do not implement. - t.Run("forward_mode_opaque_bearer_softpasses_with_jwks_configured", func(t *testing.T) { + t.Run("opaque_bearer_softpasses_with_jwks_configured", func(t *testing.T) { t.Parallel() provider := newTestOAuthProvider(t, nil) srv := &ClickHouseJWEServer{ @@ -413,7 +409,6 @@ func TestOAuthValidateToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -425,22 +420,19 @@ func TestOAuthValidateToken(t *testing.T) { require.Nil(t, claims) }) - // Gating mode rejects opaque bearers outright (MCP is a pure resource - // server; only AS-issued JWTs are valid; opaque tokens are never forwarded - // to ClickHouse in gating mode, so soft-passing them would be a silent - // auth bypass). - t.Run("gating_mode_opaque_bearer_rejected", func(t *testing.T) { + // Strict JWT-only deployments reject opaque bearers outright. + t.Run("strict_jwt_only_opaque_bearer_rejected", func(t *testing.T) { t.Parallel() provider := newTestOAuthProvider(t, nil) srv := &ClickHouseJWEServer{ Config: config.Config{ Server: config.ServerConfig{ OAuth: config.OAuthConfig{ - Enabled: true, - Mode: "gating", - Issuer: provider.server.URL, - JWKSURL: provider.server.URL + "/jwks", - Audience: "https://mcp.example.com/", + Enabled: true, + Issuer: provider.server.URL, + JWKSURL: provider.server.URL + "/jwks", + Audience: "https://mcp.example.com/", + StrictJWTOnly: true, }, }, }, @@ -450,10 +442,10 @@ func TestOAuthValidateToken(t *testing.T) { require.Nil(t, claims) }) - // C-1: forward-mode JWT with JWKS configured AND a tampered signature is + // C-1: JWT with JWKS configured AND a tampered signature is // rejected at the MCP layer before reaching ClickHouse. Closes the // pre-fix gap where any string in `Authorization: Bearer …` was accepted. - t.Run("forward_mode_jwt_with_bad_signature_rejected", func(t *testing.T) { + t.Run("jwt_with_bad_signature_rejected", func(t *testing.T) { t.Parallel() provider := newTestOAuthProvider(t, nil) srv := &ClickHouseJWEServer{ @@ -461,7 +453,6 @@ func TestOAuthValidateToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -485,11 +476,11 @@ func TestOAuthValidateToken(t *testing.T) { sigStart := dot2 + 1 tampered := token[:sigStart] + flipBase64URLChar(token[sigStart:sigStart+1]) + token[sigStart+1:] _, err := srv.ValidateOAuthToken(tampered) - require.Error(t, err, "tampered forward-mode JWT must be rejected (orig token: %q tampered: %q)", token, tampered) + require.Error(t, err, "tampered JWT must be rejected (orig token: %q tampered: %q)", token, tampered) }) - // C-1: forward-mode expired JWT is rejected at the MCP layer. - t.Run("forward_mode_expired_jwt_rejected", func(t *testing.T) { + // C-1: expired JWT is rejected at the MCP layer. + t.Run("expired_jwt_rejected", func(t *testing.T) { t.Parallel() provider := newTestOAuthProvider(t, nil) srv := &ClickHouseJWEServer{ @@ -497,7 +488,6 @@ func TestOAuthValidateToken(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -530,38 +520,23 @@ func flipBase64URLChar(c string) string { } // TestOAuthRequiresLocalValidation locks in the C-1 invariant: the auth layer -// MUST validate locally in both forward and gating modes. Pre-C-1 the gate -// returned true only for gating, leaving forward-mode tokens unchecked. +// MUST validate locally whenever OAuth is enabled. func TestOAuthRequiresLocalValidation(t *testing.T) { t.Parallel() - cases := []struct { - name string - mode string - want bool - }{ - {"gating_validates", "gating", true}, - {"forward_validates", "forward", true}, - {"empty_defaults_to_gating_validates", "", true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - t.Parallel() - srv := &ClickHouseJWEServer{ - Config: config.Config{ - Server: config.ServerConfig{ - OAuth: config.OAuthConfig{Enabled: true, Mode: tc.mode}, - }, - }, - } - require.Equal(t, tc.want, srv.oauthRequiresLocalValidation()) - }) + srv := &ClickHouseJWEServer{ + Config: config.Config{ + Server: config.ServerConfig{ + OAuth: config.OAuthConfig{Enabled: true}, + }, + }, } + require.True(t, srv.oauthRequiresLocalValidation()) t.Run("disabled_skips_validation", func(t *testing.T) { t.Parallel() srv := &ClickHouseJWEServer{ Config: config.Config{ Server: config.ServerConfig{ - OAuth: config.OAuthConfig{Enabled: false, Mode: "forward"}, + OAuth: config.OAuthConfig{Enabled: false}, }, }, } @@ -581,7 +556,6 @@ func TestOAuthIssuerEnforcement(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -608,7 +582,6 @@ func TestOAuthIssuerEnforcement(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: "https://only-this-one.example.com", JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -634,7 +607,6 @@ func TestOAuthIssuerEnforcement(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL + "/", JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -654,20 +626,12 @@ func TestOAuthIssuerEnforcement(t *testing.T) { }) } -// TestOAuthBuildClickHouseHeaders tests the forward-mode header builder. Gating -// mode no longer goes through this helper — its CH credentials are conveyed -// via the clickhouse-go Auth.Username/Auth.Password Basic header. +// TestOAuthBuildClickHouseHeaders tests the bearer header builder. func TestOAuthBuildClickHouseHeaders(t *testing.T) { t.Parallel() - t.Run("gating_returns_no_headers", func(t *testing.T) { + t.Run("wraps_token_as_bearer", func(t *testing.T) { t.Parallel() - cfg := config.OAuthConfig{Mode: "gating"} - require.Nil(t, oauth.BuildClickHouseHeaders(cfg, "token")) - }) - - t.Run("forward_wraps_token_as_bearer", func(t *testing.T) { - t.Parallel() - cfg := config.OAuthConfig{Mode: "forward"} + cfg := config.OAuthConfig{} headers := oauth.BuildClickHouseHeaders(cfg, "my-access-token") require.NotNil(t, headers) require.Equal(t, "Bearer my-access-token", headers["Authorization"]) @@ -708,7 +672,6 @@ func TestOAuthAndJWECombined(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -747,7 +710,6 @@ func TestOAuthAndJWECombined(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -796,7 +758,6 @@ func TestOAuthAndJWECombined(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -873,11 +834,10 @@ func TestOAuthAndJWECombined(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "gating", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "https://mcp.example.com", - SigningSecret: "test-gating-secret-32-byte-key!!", + SigningSecret: "test-signing-secret-32-byte-key!!", }, }, }, "test") @@ -909,7 +869,6 @@ func TestOAuthAndJWECombined(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -933,27 +892,26 @@ func TestOAuthOpenAPIHandler(t *testing.T) { chConfig := setupEmbeddedClickHouse(t) provider := newTestOAuthProvider(t, nil) - t.Run("oauth_gating_routes_bearer_through_basic_to_ch", func(t *testing.T) { + t.Run("oauth_routes_bearer_through_basic_to_ch", func(t *testing.T) { t.Parallel() // Gating mode rewrites Auth to Basic email:JWT for the CH-side // ch-jwt-verify sidecar. The embedded CH has no http_authentication // configured, so the request fails — but the failure is at the CH // layer, not at MCP. Assert non-401 to confirm MCP forwarded the // bearer. - const gatingSecret = "test-gating-secret-32-byte-key!!" + const signingSecret = "test-signing-secret-32-byte-key!!" srv := NewClickHouseMCPServer(config.Config{ ClickHouse: *chConfig, Server: config.ServerConfig{ JWE: config.JWEConfig{Enabled: false}, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "gating", - SigningSecret: gatingSecret, + SigningSecret: signingSecret, }, }, }, "test") - oauthToken := mintSelfIssuedToken(t, gatingSecret, map[string]interface{}{ + oauthToken := mintSelfIssuedToken(t, signingSecret, map[string]interface{}{ "sub": "user123", "email": "user123@example.com", "exp": time.Now().Add(time.Hour).Unix(), @@ -967,8 +925,8 @@ func TestOAuthOpenAPIHandler(t *testing.T) { srv.OpenAPIHandler(rr, req) // Embedded CH rejects unknown user → 500 from the client builder. - // MCP itself does not validate, so 401 would indicate a regression - // in the pure-forwarder contract. + // MCP itself does not validate, so 401 would indicate a regression in + // the per-request bearer contract. require.NotEqual(t, http.StatusUnauthorized, rr.Code, rr.Body.String()) }) @@ -980,7 +938,6 @@ func TestOAuthOpenAPIHandler(t *testing.T) { JWE: config.JWEConfig{Enabled: false}, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -1005,7 +962,6 @@ func TestOAuthOpenAPIHandler(t *testing.T) { JWE: config.JWEConfig{Enabled: false}, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -1022,7 +978,7 @@ func TestOAuthOpenAPIHandler(t *testing.T) { srv.OpenAPIHandler(rr, req) require.Equal(t, http.StatusInternalServerError, rr.Code, - "forward mode should pass token to CH; standard CH rejects Bearer auth") + "OAuth should pass token to CH; standard CH rejects Bearer auth") require.Contains(t, rr.Body.String(), "Failed to get ClickHouse client", "response should indicate CH connection failure, not MCP rejection") }) @@ -1035,7 +991,6 @@ func TestOAuthOpenAPIHandler(t *testing.T) { JWE: config.JWEConfig{Enabled: false}, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -1054,7 +1009,7 @@ func TestOAuthOpenAPIHandler(t *testing.T) { srv.OpenAPIHandler(rr, req) require.Equal(t, http.StatusInternalServerError, rr.Code, - "forward mode should pass token to CH; standard CH rejects Bearer auth") + "OAuth should pass token to CH; standard CH rejects Bearer auth") require.Contains(t, rr.Body.String(), "Failed to get ClickHouse client", "response should indicate CH connection failure, not MCP rejection") }) @@ -1067,7 +1022,6 @@ func TestOAuthOpenAPIHandler(t *testing.T) { JWE: config.JWEConfig{Enabled: false}, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -1084,7 +1038,7 @@ func TestOAuthOpenAPIHandler(t *testing.T) { srv.OpenAPIHandler(rr, req) require.Equal(t, http.StatusInternalServerError, rr.Code, - "forward mode should pass token to CH; standard CH rejects Bearer auth") + "OAuth should pass token to CH; standard CH rejects Bearer auth") require.Contains(t, rr.Body.String(), "Failed to get ClickHouse client", "response should indicate CH connection failure, not MCP rejection") }) @@ -1146,11 +1100,10 @@ func TestGetClickHouseClientWithOAuth(t *testing.T) { require.NoError(t, client.Close()) }) - t.Run("with_oauth_forwarding", func(t *testing.T) { + t.Run("with_oauth_bearer_header", func(t *testing.T) { t.Parallel() cfg := config.OAuthConfig{ Enabled: true, - Mode: "forward", } headers := oauth.BuildClickHouseHeaders(cfg, "oauth-token") require.NotNil(t, headers) @@ -1193,7 +1146,7 @@ func TestGetClickHouseClientWithOAuth(t *testing.T) { require.NoError(t, client.Close()) }) - t.Run("gating_mode_sets_basic_creds_from_jwt", func(t *testing.T) { + t.Run("oauth_sets_basic_creds_from_jwt", func(t *testing.T) { t.Parallel() // Gating mode unverified-decodes the JWT email claim and routes it as // `Authorization: Basic base64(email:JWT)` via clickhouse-go's @@ -1202,8 +1155,8 @@ func TestGetClickHouseClientWithOAuth(t *testing.T) { // gate); the embedded CH rejects an unknown user, so we expect a // connection failure, but the error should reference the email // (proving the wire-format switch fired). - const gatingSecret = "test-gating-secret-32-byte-key!!" - oauthToken := mintSelfIssuedToken(t, gatingSecret, map[string]interface{}{ + const signingSecret = "test-signing-secret-32-byte-key!!" + oauthToken := mintSelfIssuedToken(t, signingSecret, map[string]interface{}{ "sub": "u-alice", "email": "alice@example.com", "exp": time.Now().Add(time.Hour).Unix(), @@ -1213,8 +1166,7 @@ func TestGetClickHouseClientWithOAuth(t *testing.T) { Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "gating", - SigningSecret: gatingSecret, + SigningSecret: signingSecret, }, }, }, "test") @@ -1225,15 +1177,14 @@ func TestGetClickHouseClientWithOAuth(t *testing.T) { require.Error(t, err) }) - t.Run("gating_mode_non_jwt_bearer_rejected", func(t *testing.T) { + t.Run("strict_jwt_only_non_jwt_bearer_rejected", func(t *testing.T) { t.Parallel() srv := NewClickHouseMCPServer(config.Config{ ClickHouse: *chConfig, Server: config.ServerConfig{ OAuth: config.OAuthConfig{ Enabled: true, - Mode: "gating", - SigningSecret: "test-gating-secret-32-byte-key!!", + StrictJWTOnly: true, }, }, }, "test") @@ -1279,7 +1230,7 @@ func TestValidateAuth(t *testing.T) { Config: config.Config{ Server: config.ServerConfig{ JWE: config.JWEConfig{Enabled: true, JWESecretKey: jweSecret, JWTSecretKey: jwtSecret}, - OAuth: config.OAuthConfig{Enabled: true, Mode: "forward"}, + OAuth: config.OAuthConfig{Enabled: true}, }, }, } @@ -1309,7 +1260,7 @@ func TestValidateAuth(t *testing.T) { Config: config.Config{ Server: config.ServerConfig{ JWE: config.JWEConfig{Enabled: true, JWESecretKey: jweSecret, JWTSecretKey: jwtSecret}, - OAuth: config.OAuthConfig{Enabled: true, Mode: "forward"}, + OAuth: config.OAuthConfig{Enabled: true}, }, }, } @@ -1338,7 +1289,7 @@ func TestValidateAuth(t *testing.T) { Config: config.Config{ Server: config.ServerConfig{ JWE: config.JWEConfig{Enabled: true, JWESecretKey: jweSecret, JWTSecretKey: jwtSecret}, - OAuth: config.OAuthConfig{Enabled: true, Mode: "forward"}, + OAuth: config.OAuthConfig{Enabled: true}, }, }, } @@ -1356,7 +1307,7 @@ func TestValidateAuth(t *testing.T) { Config: config.Config{ Server: config.ServerConfig{ JWE: config.JWEConfig{Enabled: true, JWESecretKey: "this-is-a-32-byte-secret-key!!", JWTSecretKey: "jwt"}, - OAuth: config.OAuthConfig{Enabled: true, Mode: "forward"}, + OAuth: config.OAuthConfig{Enabled: true}, }, }, } @@ -1377,7 +1328,7 @@ func TestValidateAuth(t *testing.T) { Config: config.Config{ Server: config.ServerConfig{ JWE: config.JWEConfig{Enabled: true, JWESecretKey: "this-is-a-32-byte-secret-key!!", JWTSecretKey: "jwt"}, - OAuth: config.OAuthConfig{Enabled: true, Mode: "forward"}, + OAuth: config.OAuthConfig{Enabled: true}, }, }, } @@ -1403,7 +1354,7 @@ func TestValidateAuth(t *testing.T) { Config: config.Config{ Server: config.ServerConfig{ JWE: config.JWEConfig{Enabled: true, JWESecretKey: jweSecret, JWTSecretKey: jwtSecret}, - OAuth: config.OAuthConfig{Enabled: true, Mode: "forward"}, + OAuth: config.OAuthConfig{Enabled: true}, }, }, } @@ -1464,11 +1415,10 @@ func TestOAuthMCPToolExecution(t *testing.T) { require.Equal(t, 1, qr.Count) }) - t.Run("execute_query_with_oauth_and_header_forwarding", func(t *testing.T) { + t.Run("execute_query_with_oauth_bearer_header", func(t *testing.T) { t.Parallel() cfg := config.OAuthConfig{ Enabled: true, - Mode: "forward", } oauthToken := "opaque-access-token" headers := oauth.BuildClickHouseHeaders(cfg, oauthToken) @@ -1505,7 +1455,6 @@ func TestOAuthMCPToolExecution(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", }, @@ -1544,7 +1493,7 @@ func TestOAuthOpenAPIFullFlow(t *testing.T) { antalyaCH := setupEmbeddedAntalyaWithOIDC(t, oidcProvider.server.URL) srv := NewClickHouseMCPServer(config.Config{ ClickHouse: antalyaCH, - Server: config.ServerConfig{OAuth: config.OAuthConfig{Enabled: true, Mode: "forward"}}, + Server: config.ServerConfig{OAuth: config.OAuthConfig{Enabled: true}}, }, "test") oauthToken := oidcProvider.issueJWT(t, map[string]interface{}{ "sub": "service-account-123", @@ -1563,7 +1512,7 @@ func TestOAuthOpenAPIFullFlow(t *testing.T) { require.Contains(t, qr.Columns, "version") }) - t.Run("forward_mode_passthrough_wrong_audience", func(t *testing.T) { + t.Run("oauth_bearer_passthrough_wrong_audience", func(t *testing.T) { t.Parallel() srv := NewClickHouseMCPServer(config.Config{ ClickHouse: *chConfig, @@ -1571,7 +1520,6 @@ func TestOAuthOpenAPIFullFlow(t *testing.T) { JWE: config.JWEConfig{Enabled: false}, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "expected-audience", @@ -1589,12 +1537,12 @@ func TestOAuthOpenAPIFullFlow(t *testing.T) { srv.OpenAPIHandler(rr, req) require.Equal(t, http.StatusInternalServerError, rr.Code, - "forward mode should pass token to CH; standard CH rejects Bearer auth") + "OAuth should pass token to CH; standard CH rejects Bearer auth") require.Contains(t, rr.Body.String(), "Failed to get ClickHouse client", "response should indicate CH connection failure, not MCP rejection") }) - t.Run("forward_mode_passthrough_missing_scope", func(t *testing.T) { + t.Run("oauth_bearer_passthrough_missing_scope", func(t *testing.T) { t.Parallel() srv := NewClickHouseMCPServer(config.Config{ ClickHouse: *chConfig, @@ -1602,7 +1550,6 @@ func TestOAuthOpenAPIFullFlow(t *testing.T) { JWE: config.JWEConfig{Enabled: false}, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", Issuer: provider.server.URL, JWKSURL: provider.server.URL + "/jwks", Audience: "clickhouse-api", @@ -1621,7 +1568,7 @@ func TestOAuthOpenAPIFullFlow(t *testing.T) { srv.OpenAPIHandler(rr, req) require.Equal(t, http.StatusInternalServerError, rr.Code, - "forward mode should pass token to CH; standard CH rejects Bearer auth") + "OAuth should pass token to CH; standard CH rejects Bearer auth") require.Contains(t, rr.Body.String(), "Failed to get ClickHouse client", "response should indicate CH connection failure, not MCP rejection") }) diff --git a/pkg/server/server_client.go b/pkg/server/server_client.go index 92a8111b..9b441a40 100644 --- a/pkg/server/server_client.go +++ b/pkg/server/server_client.go @@ -50,10 +50,19 @@ func (s *ClickHouseJWEServer) GetClickHouseClient(ctx context.Context, tokenPara return client, nil } -// buildConfigFromClaims builds a ClickHouse config from JWE claims +// buildConfigFromClaims builds a ClickHouse config from JWE claims. +// In multi-cluster mode the caller passes a base config whose Host has +// already been template-expanded for the active cluster; we do not reach +// for s.Config.ClickHouse so the per-request cluster routing is preserved. func (s *ClickHouseJWEServer) buildConfigFromClaims(claims map[string]interface{}) (config.ClickHouseConfig, error) { - // Create a new ClickHouse config from the claims - chConfig := s.Config.ClickHouse // Use default as base + return s.buildConfigFromClaimsWithBase(s.Config.ClickHouse, claims) +} + +// buildConfigFromClaimsWithBase is the explicit-base variant used by the +// multi-cluster path. Same body as buildConfigFromClaims but takes the base +// chCfg as a parameter so the global is never consulted on the hot path. +func (s *ClickHouseJWEServer) buildConfigFromClaimsWithBase(base config.ClickHouseConfig, claims map[string]interface{}) (config.ClickHouseConfig, error) { + chConfig := base // copy if host, ok := claims["host"].(string); ok && host != "" { chConfig.Host = host @@ -98,12 +107,16 @@ func (s *ClickHouseJWEServer) buildConfigFromClaims(claims map[string]interface{ return chConfig, nil } -// GetClickHouseClientFromCtx creates a ClickHouse client using JWE and/or OAuth tokens from context +// GetClickHouseClientFromCtx creates a ClickHouse client using JWE and/or +// OAuth tokens from context. When the multi-cluster router has injected a +// per-request ClickHouseConfig (host templated for the active cluster), it +// is used; otherwise s.Config.ClickHouse is the base. func (s *ClickHouseJWEServer) GetClickHouseClientFromCtx(ctx context.Context) (*clickhouse.Client, error) { jweToken := s.ExtractTokenFromCtx(ctx) oauthToken := s.ExtractOAuthTokenFromCtx(ctx) oauthClaims := s.GetOAuthClaimsFromCtx(ctx) - return s.GetClickHouseClientWithOAuth(ctx, jweToken, oauthToken, oauthClaims) + chCfg := CHConfigFromContext(ctx, s.Config.ClickHouse) + return s.GetClickHouseClientWithOAuthForConfig(ctx, chCfg, jweToken, oauthToken, oauthClaims) } // GetJWEClaimsFromCtx extracts parsed JWE claims from context. @@ -158,9 +171,8 @@ func (s *ClickHouseJWEServer) ValidateAuth(r *http.Request) (jweToken string, jw } } - // Fall through to OAuth. MCP is a pure forwarder for queries: the - // ch-jwt-verify sidecar cryptographically validates the JWT at each - // ClickHouse query, so MCP does not re-validate here. + // Fall through to OAuth. ClickHouse or its sidecar cryptographically + // validates the JWT at each query, so MCP does not re-validate here. if oauthEnabled { oauthToken = s.ExtractOAuthTokenFromRequest(r) if oauthToken == "" { @@ -188,9 +200,22 @@ func (s *ClickHouseJWEServer) openAPIPathPrefixes() []string { return []string{""} } -// GetClickHouseClientWithOAuth creates a ClickHouse client, optionally forwarding OAuth headers +// GetClickHouseClientWithOAuth creates a ClickHouse client using OAuth bearer +// credentials when present. Uses the global s.Config.ClickHouse as the base. +// The multi-cluster path calls GetClickHouseClientWithOAuthForConfig instead, +// threading a per-request chCfg through so per-cluster host templating is +// preserved. func (s *ClickHouseJWEServer) GetClickHouseClientWithOAuth(ctx context.Context, jweToken string, oauthToken string, oauthClaims *OAuthClaims) (*clickhouse.Client, error) { - // Build base config + return s.GetClickHouseClientWithOAuthForConfig(ctx, s.Config.ClickHouse, jweToken, oauthToken, oauthClaims) +} + +// GetClickHouseClientWithOAuthForConfig is the chCfg-explicit variant of +// GetClickHouseClientWithOAuth. The caller is responsible for supplying a +// ClickHouseConfig whose Host has already been template-expanded for the +// active cluster. The global s.Config.ClickHouse is not consulted on a +// per-request hot path under this entry point — single-cluster callers +// route here too via the no-arg shim above. +func (s *ClickHouseJWEServer) GetClickHouseClientWithOAuthForConfig(ctx context.Context, chCfg config.ClickHouseConfig, jweToken string, oauthToken string, oauthClaims *OAuthClaims) (*clickhouse.Client, error) { var chConfig config.ClickHouseConfig var err error @@ -203,66 +228,131 @@ func (s *ClickHouseJWEServer) GetClickHouseClientWithOAuth(ctx context.Context, return nil, fmt.Errorf("failed to parse JWE token: %w", err) } } - chConfig, err = s.buildConfigFromClaims(claims) + chConfig, err = s.buildConfigFromClaimsWithBase(chCfg, claims) if err != nil { return nil, err } } else { - chConfig = s.Config.ClickHouse - } - - // Switch on OAuth mode to pick the CH wire format. Forward and gating - // both apply only when an OAuth bearer is on the inbound request. - if s.Config.Server.OAuth.Enabled && oauthToken != "" { - switch { - case s.Config.Server.OAuth.IsForwardMode(): - // Forward mode: rewrite the Authorization header so ClickHouse - // receives `Bearer ` directly. Antalya's token_processors - // validates the bearer cryptographically. - oauthHeaders := oauth.BuildClickHouseHeaders(s.Config.Server.OAuth, oauthToken) - if len(oauthHeaders) > 0 { - if chConfig.HttpHeaders == nil { - chConfig.HttpHeaders = make(map[string]string) - } - for k, v := range oauthHeaders { - chConfig.HttpHeaders[k] = v - } - } - chConfig.Username = "" - chConfig.Password = "" - case s.Config.Server.OAuth.IsGatingMode(): - // Gating mode: ClickHouse's calls the - // colocated ch-jwt-verify sidecar over loopback to validate the - // JWT. The CH driver assembles `Authorization: Basic - // base64(email:JWT)` from Username/Password. The email is - // unverified-decoded from the JWT here; the sidecar enforces - // signature/iss/aud/exp/scope and the user-vs-email match. - email, ok := emailFromUnverifiedJWT(oauthToken) - if !ok { - return nil, fmt.Errorf("oauth gating: bearer is not a JWT with an email claim") - } - chConfig.Username = email - chConfig.Password = oauthToken - // http_authentication is HTTP-only on the CH side. Force the - // driver to use HTTP regardless of static config. - chConfig.Protocol = config.HTTPProtocol - } + chConfig = chCfg } - // Merge tool-input settings from context (tool_input_settings) + // Merge tool-input settings before OAuth so probe configs carry them. if toolSettings := ToolInputSettingsFromContext(ctx); len(toolSettings) > 0 { chConfig = mergeExtraSettings(chConfig, toolSettings) } - // Create client + if s.Config.Server.OAuth.Enabled && oauthToken != "" { + return s.newClientWithOAuth(ctx, chConfig, oauthToken) + } + client, err := clickhouse.NewClient(ctx, chConfig) if err != nil { return nil, fmt.Errorf("failed to create ClickHouse client: %w", err) } + return client, nil +} + +// chOAuthMethod is the wire format used to authenticate against ClickHouse. +type chOAuthMethod int8 + +const ( + chOAuthMethodBearer chOAuthMethod = 1 // Authorization: Bearer + chOAuthMethodBasic chOAuthMethod = 2 // Authorization: Basic base64(email:token) +) + +// newClientWithOAuth resolves the CH auth method for the endpoint (from cache +// or by probing), then creates and returns a connected ClickHouse client. +// +// The method is auto-detected by trying Bearer first; on CH auth error it +// falls back to Basic and caches the result so subsequent requests skip the +// probe. +func (s *ClickHouseJWEServer) newClientWithOAuth(ctx context.Context, chCfg config.ClickHouseConfig, token string) (*clickhouse.Client, error) { + cfg := s.Config.Server.OAuth + endpoint := fmt.Sprintf("%s:%d", chCfg.Host, chCfg.Port) + + // Cache hit — use the stored method. + if v, ok := s.chOAuthMethodCache.Load(endpoint); ok { + return newClientForOAuthMethod(ctx, chCfg, token, v.(chOAuthMethod), cfg) + } + + // Auto-detect. Try Bearer first. NewClient pings internally, so a failed + // ping surfaces as an auth error here if CH rejects the token. + bearerCfg := oauthApplyBearer(chCfg, token, cfg) + if client, err := clickhouse.NewClient(ctx, bearerCfg); err == nil { + s.chOAuthMethodCache.Store(endpoint, chOAuthMethodBearer) + return client, nil // return probe client directly — no second ping + } else if !isChAuthError(err) { + return nil, fmt.Errorf("failed to create ClickHouse client: %w", err) + } + + // Bearer got an auth error — try Basic (ch-jwt-verify sidecar path). + log.Debug().Str("endpoint", endpoint).Msg("oauth: Bearer rejected by CH, probing Basic") + email, ok := emailFromUnverifiedJWT(token) + if !ok { + return nil, fmt.Errorf("oauth: bearer is not a JWT with an email claim") + } + basicCfg := oauthApplyBasic(chCfg, email, token) + client, err := clickhouse.NewClient(ctx, basicCfg) + if err != nil { + return nil, fmt.Errorf("failed to create ClickHouse client: %w", err) + } + s.chOAuthMethodCache.Store(endpoint, chOAuthMethodBasic) + return client, nil +} +// newClientForOAuthMethod applies method to chCfg and creates a CH client. +func newClientForOAuthMethod(ctx context.Context, chCfg config.ClickHouseConfig, token string, method chOAuthMethod, oauthCfg oauth.OAuthConfig) (*clickhouse.Client, error) { + switch method { + case chOAuthMethodBearer: + chCfg = oauthApplyBearer(chCfg, token, oauthCfg) + case chOAuthMethodBasic: + email, ok := emailFromUnverifiedJWT(token) + if !ok { + return nil, fmt.Errorf("oauth: bearer is not a JWT with an email claim") + } + chCfg = oauthApplyBasic(chCfg, email, token) + } + client, err := clickhouse.NewClient(ctx, chCfg) + if err != nil { + return nil, fmt.Errorf("failed to create ClickHouse client: %w", err) + } return client, nil } +// oauthApplyBearer returns a copy of chCfg with an Authorization: Bearer header set. +func oauthApplyBearer(chCfg config.ClickHouseConfig, token string, oauthCfg oauth.OAuthConfig) config.ClickHouseConfig { + headers := oauth.BuildClickHouseHeaders(oauthCfg, token) + if len(headers) > 0 { + if chCfg.HttpHeaders == nil { + chCfg.HttpHeaders = make(map[string]string) + } + for k, v := range headers { + chCfg.HttpHeaders[k] = v + } + } + chCfg.Username = "" + chCfg.Password = "" + return chCfg +} + +// oauthApplyBasic returns a copy of chCfg with Basic auth credentials set. +// CH's http_authentication extension expects Basic base64(email:JWT) and +// delegates validation to the ch-jwt-verify sidecar over loopback. +func oauthApplyBasic(chCfg config.ClickHouseConfig, email, token string) config.ClickHouseConfig { + chCfg.Username = email + chCfg.Password = token + chCfg.Protocol = config.HTTPProtocol + return chCfg +} + +// isChAuthError reports whether err is a CH authentication failure. +// Delegates to ClassifyDiscoveryError (multicluster_identity.go) which covers +// HTTP 401/403, CH exception codes 497/516/519, and common error strings. +func isChAuthError(err error) bool { + auth, _ := ClassifyDiscoveryError(err) + return auth +} + // emailFromUnverifiedJWT decodes the JWT payload without verifying the // signature and returns the `email` claim (or first namespaced `*/email` // fallback). Used only to populate the CH `Basic` username so the sidecar can @@ -277,13 +367,13 @@ func emailFromUnverifiedJWT(token string) (string, bool) { if err != nil { // Some IdPs emit padded segments; try the std encoding as a fallback. if payload, err = base64.URLEncoding.DecodeString(parts[1]); err != nil { - log.Debug().Err(err).Msg("oauth gating: failed to base64-decode JWT payload") + log.Debug().Err(err).Msg("oauth: failed to base64-decode JWT payload") return "", false } } var raw map[string]interface{} if err := json.Unmarshal(payload, &raw); err != nil { - log.Debug().Err(err).Msg("oauth gating: failed to JSON-parse JWT payload") + log.Debug().Err(err).Msg("oauth: failed to JSON-parse JWT payload") return "", false } if e, ok := raw["email"].(string); ok { diff --git a/pkg/server/server_dynamic_tools.go b/pkg/server/server_dynamic_tools.go index 4d7ef4c5..0ecb7dd7 100644 --- a/pkg/server/server_dynamic_tools.go +++ b/pkg/server/server_dynamic_tools.go @@ -53,7 +53,7 @@ type dynamicToolCommentAnnotations struct { // every request: the fast path short-circuits once init completes. // // Discovery is deferred until the caller has usable credentials. In OAuth -// forward mode the Bearer token only arrives on tools/call, not tools/list — +// OAuth the Bearer token only arrives on tools/call, not tools/list — // so the first tools/list just returns static tools, and the first authenticated // tools/call triggers discovery. The MCP SDK's AddTool automatically fires // notifications/tools/list_changed, prompting the client to re-fetch. @@ -157,8 +157,27 @@ func (s *ClickHouseJWEServer) discoverReadTools(ctx context.Context) (map[string if len(readRules) == 0 { return map[string]dynamicToolMeta{}, nil } + factory := func(ctx context.Context, _ config.ClickHouseConfig) (*clickhouse.Client, error) { + return s.getDiscoveryClient(ctx) + } + return discoverReadToolsWith(ctx, s.Config.ClickHouse, factory, readRules) +} + +// discoverReadToolsWith is the factory-based variant of discoverReadTools. +// chCfg is authoritative for host/port/TLS; factory is responsible for +// credential extraction (JWE/OAuth in single-cluster mode; OAuth-only in +// multi-cluster mode). +func discoverReadToolsWith( + ctx context.Context, + chCfg config.ClickHouseConfig, + factory func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error), + readRules []config.DynamicToolRule, +) (map[string]dynamicToolMeta, error) { + if len(readRules) == 0 { + return map[string]dynamicToolMeta{}, nil + } - chClient, err := s.getDiscoveryClient(ctx) + chClient, err := factory(ctx, chCfg) if err != nil { return nil, fmt.Errorf("dynamic_tools: failed to get ClickHouse client: %w", err) } @@ -266,8 +285,25 @@ func (s *ClickHouseJWEServer) discoverWriteTools(ctx context.Context) (map[strin if len(writeRules) == 0 { return map[string]dynamicToolMeta{}, nil } + factory := func(ctx context.Context, _ config.ClickHouseConfig) (*clickhouse.Client, error) { + return s.getDiscoveryClient(ctx) + } + return discoverWriteToolsWith(ctx, s.Config.ClickHouse, factory, writeRules) +} + +// discoverWriteToolsWith is the factory-based variant of discoverWriteTools. +// Caller is responsible for filtering rules and checking read-only mode. +func discoverWriteToolsWith( + ctx context.Context, + chCfg config.ClickHouseConfig, + factory func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error), + writeRules []config.DynamicToolRule, +) (map[string]dynamicToolMeta, error) { + if len(writeRules) == 0 { + return map[string]dynamicToolMeta{}, nil + } - chClient, err := s.getDiscoveryClient(ctx) + chClient, err := factory(ctx, chCfg) if err != nil { return nil, fmt.Errorf("dynamic_tools: failed to get ClickHouse client: %w", err) } @@ -329,7 +365,7 @@ func (s *ClickHouseJWEServer) discoverWriteTools(ctx context.Context) (map[strin } rc := rules[matched[0]] - cols, colErr := s.getTableColumnsForMode(ctx, chClient, db, name) + cols, colErr := getTableColumnsForMode(ctx, chClient, db, name) if colErr != nil { log.Warn().Err(colErr).Str("table", full).Msg("dynamic_tools: failed to get columns for write tool, skipping") continue @@ -373,7 +409,7 @@ func (s *ClickHouseJWEServer) discoverWriteTools(ctx context.Context) (map[strin // ClickHouse versions. Some older versions (e.g., 26.1.x Altinity Antalya) // do not expose a `column_type` column, so we rely on `default_kind` alone // which carries the same information for our purposes. -func (s *ClickHouseJWEServer) getTableColumnsForMode(ctx context.Context, chClient *clickhouse.Client, db, table string) ([]dynamicToolParam, error) { +func getTableColumnsForMode(ctx context.Context, chClient *clickhouse.Client, db, table string) ([]dynamicToolParam, error) { q := fmt.Sprintf( "SELECT name, type, default_kind, comment FROM system.columns WHERE database='%s' AND table='%s' ORDER BY position", db, table, @@ -446,9 +482,52 @@ func buildWriteToolDescription(comment, db, table, mode string) string { // registerDynamicTools commits discovered read and write tools to the MCP server. // AddTool automatically fires notifications/tools/list_changed so clients refresh. +// +// In multi-cluster mode the equivalent path is RegisterDynamicToolsOn, called +// against a per-(bearer,cluster) *mcp.Server adapter — *not* via this method, +// because s.dynamicTools is global to the single-cluster ClickHouseJWEServer +// and would poison cross-tenant if shared. func (s *ClickHouseJWEServer) registerDynamicTools(readTools, writeTools map[string]dynamicToolMeta) { - for toolName, meta := range readTools { - s.dynamicTools[toolName] = meta + merged := make(map[string]dynamicToolMeta, len(readTools)+len(writeTools)) + for k, v := range readTools { + s.dynamicTools[k] = v + merged[k] = v + } + for k, v := range writeTools { + s.dynamicTools[k] = v + merged[k] = v + } + writeHandlerFactory := s.makeDynamicWriteToolHandler + registerDynamicToolsOn(s, merged, s.Config.Server.ToolInputSettings, writeHandlerFactory) + log.Info(). + Int("read_tools", len(readTools)). + Int("write_tools", len(writeTools)). + Msg("Dynamic tools registered") +} + +// RegisterDynamicToolsOn registers the discovered dynamic tools (both read +// and write) on srv. Pure helper — no mutation of any ClickHouseJWEServer +// state. Used by multi-cluster mode: each (bearer, cluster) request builds +// a fresh *mcp.Server and populates it via this function. +// +// For write tools, the handler delegates to the JWEServer-bound +// makeDynamicWriteToolHandler closure when one is provided (parent != nil); +// when parent is nil, write tools use a standalone handler that pulls the +// JWEServer from context (multi-cluster path injects it via serverInjector). +func RegisterDynamicToolsOn(srv AltinityMCPServer, tools map[string]dynamicToolMeta, cfg *config.Config) { + registerDynamicToolsOn(srv, tools, cfg.Server.ToolInputSettings, nil) +} + +// registerDynamicToolsOn is the internal implementation. writeHandlerFactory +// is non-nil in single-cluster mode (binds to *ClickHouseJWEServer); nil in +// multi-cluster mode (handler grabs the server from context). +func registerDynamicToolsOn( + srv AltinityMCPServer, + tools map[string]dynamicToolMeta, + toolInputSettings []string, + writeHandlerFactory func(meta dynamicToolMeta) ToolHandlerFunc, +) { + for toolName, meta := range tools { props := make(map[string]any, len(meta.Params)+1) required := make([]string, 0, len(meta.Params)) for _, p := range meta.Params { @@ -457,44 +536,85 @@ func (s *ClickHouseJWEServer) registerDynamicTools(readTools, writeTools map[str required = append(required, p.Name) } } - if settingsSchema := buildToolInputSettingsSchema(s.Config.Server.ToolInputSettings); settingsSchema != nil { + if settingsSchema := buildToolInputSettingsSchema(toolInputSettings); settingsSchema != nil { props["settings"] = settingsSchema } - s.AddTool(&mcp.Tool{ + tool := &mcp.Tool{ Name: toolName, Title: meta.Title, Description: meta.Description, Annotations: meta.Annotations, InputSchema: dynamicToolInputSchema(props, required), - }, makeDynamicToolHandler(meta)) + } + switch meta.ToolType { + case "write": + if writeHandlerFactory != nil { + srv.AddTool(tool, writeHandlerFactory(meta)) + } else { + srv.AddTool(tool, makeStandaloneDynamicWriteToolHandler(meta)) + } + default: + srv.AddTool(tool, makeDynamicToolHandler(meta)) + } } +} - for toolName, meta := range writeTools { - s.dynamicTools[toolName] = meta - props := make(map[string]any, len(meta.Params)+1) - required := make([]string, 0, len(meta.Params)) - for _, p := range meta.Params { - props[p.Name] = buildParamSchema(p) - if p.Required { - required = append(required, p.Name) - } +// DiscoverTools runs the same discovery queries as +// (*ClickHouseJWEServer).EnsureDynamicTools but accepts an explicit +// ClickHouseConfig and a client factory, so multi-cluster mode can target +// per-cluster hosts without reaching for any global state. Returns the +// merged read+write tools map. +func DiscoverTools( + ctx context.Context, + chCfg config.ClickHouseConfig, + factory func(ctx context.Context, chCfg config.ClickHouseConfig) (*clickhouse.Client, error), + rules []config.DynamicToolRule, + readOnly bool, +) (map[string]dynamicToolMeta, error) { + merged := make(map[string]dynamicToolMeta) + if len(rules) == 0 { + return merged, nil + } + + readRules := filterRulesByType(rules, "read") + if len(readRules) > 0 { + readTools, err := discoverReadToolsWith(ctx, chCfg, factory, readRules) + if err != nil { + return nil, err } - if settingsSchema := buildToolInputSettingsSchema(s.Config.Server.ToolInputSettings); settingsSchema != nil { - props["settings"] = settingsSchema + for k, v := range readTools { + merged[k] = v } - s.AddTool(&mcp.Tool{ - Name: toolName, - Title: meta.Title, - Description: meta.Description, - Annotations: meta.Annotations, - InputSchema: dynamicToolInputSchema(props, required), - }, s.makeDynamicWriteToolHandler(meta)) } - log.Info(). - Int("read_tools", len(readTools)). - Int("write_tools", len(writeTools)). - Msg("Dynamic tools registered") + if !readOnly { + writeRules := filterRulesByType(rules, "write") + if len(writeRules) > 0 { + writeTools, err := discoverWriteToolsWith(ctx, chCfg, factory, writeRules) + if err != nil { + return nil, err + } + for k, v := range writeTools { + merged[k] = v + } + } + } + + return merged, nil +} + +// makeStandaloneDynamicWriteToolHandler returns a write-tool handler that +// pulls the *ClickHouseJWEServer from ctx (via CHJWEServerKey) — used in +// multi-cluster mode where there is no single parent server bound at +// registration time. +func makeStandaloneDynamicWriteToolHandler(meta dynamicToolMeta) ToolHandlerFunc { + return func(ctx context.Context, req *mcp.CallToolRequest) (*mcp.CallToolResult, error) { + chJweServer := GetClickHouseJWEServerFromContext(ctx) + if chJweServer == nil { + return nil, fmt.Errorf("can't get JWEServer from context") + } + return chJweServer.makeDynamicWriteToolHandler(meta)(ctx, req) + } } func makeDynamicToolHandler(meta dynamicToolMeta) ToolHandlerFunc { diff --git a/pkg/server/server_test.go b/pkg/server/server_test.go index d0f7c8d0..3d677f31 100644 --- a/pkg/server/server_test.go +++ b/pkg/server/server_test.go @@ -152,11 +152,11 @@ func TestOpenAPIHandlers(t *testing.T) { t.Run("combined_auth_oauth_only_via_openapi", func(t *testing.T) { t.Parallel() - // Gating-mode OAuth + JWE both enabled: with a bearer-only request, - // MCP forwards the bearer via Basic email:JWT to ClickHouse, which - // rejects unknown users when no sidecar is configured. MCP itself - // is a pure forwarder, so the request reaches the CH layer (non-401). - const gatingSecret = "test-gating-secret-32-byte-key!!" + // OAuth + JWE both enabled: with a bearer-only request, MCP presents + // the bearer via Basic email:JWT to ClickHouse, which rejects unknown + // users when no sidecar is configured. The request should reach the + // CH layer (non-401). + const signingSecret = "test-signing-secret-32-byte-key!!" srv := NewClickHouseMCPServer(config.Config{ ClickHouse: *chConfig, Server: config.ServerConfig{ @@ -167,13 +167,12 @@ func TestOpenAPIHandlers(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "gating", - SigningSecret: gatingSecret, + SigningSecret: signingSecret, }, }, }, "test") - oauthToken := mintSelfIssuedToken(t, gatingSecret, map[string]interface{}{ + oauthToken := mintSelfIssuedToken(t, signingSecret, map[string]interface{}{ "sub": "user123", "email": "user123@example.com", "exp": time.Now().Add(time.Hour).Unix(), @@ -432,7 +431,6 @@ func TestOpenAPI_SchemaIncludesCombinedAuthPaths(t *testing.T) { }, OAuth: config.OAuthConfig{ Enabled: true, - Mode: "forward", }, }, },