{
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "$id": "https://u22a8.ai/schemas/model.schema.json",
    "title": "Model Configuration",
    "description": "Schema for model.yaml — defines traits, data sources, training config, and optional discovery settings.",
    "type": "object",
    "required": ["traits"],
    "additionalProperties": false,
    "properties": {
        "description": {
            "type": "string",
            "description": "Short, model-level description shown on the catalog and the model page. One sentence is plenty. When omitted, the catalog falls back to listing trait names."
        },
        "traits": {
            "description": "Trait definitions. Either the string 'discover' (full auto-discovery — trait count comes from effort), or a list of trait objects with optional `- discover: N` entries to override the count.",
            "oneOf": [
                { "const": "discover" },
                {
                    "type": "array",
                    "minItems": 1,
                    "items": {
                        "oneOf": [
                            { "$ref": "#/$defs/trait" },
                            { "$ref": "#/$defs/discoverEntry" }
                        ]
                    }
                }
            ]
        },
        "sources": {
            "type": "array",
            "description": "Data sources providing training samples.",
            "items": { "$ref": "#/$defs/source" }
        },
        "training": { "$ref": "#/$defs/training" },
        "discovery": { "$ref": "#/$defs/discovery" },
        "effort": {
            "type": "string",
            "enum": ["low", "medium", "high", "xhigh", "max"],
            "default": "high",
            "description": "How much cost/time/quality to put into the build. Authors don't prescribe model ids or numeric budgets; the codebase decodes the level. See docs/effort.md and DR 031."
        },
        "additional_terms": {
            "type": "string",
            "description": "Free-form additional terms (markdown OK). Presence triggers a consent prompt and prominent display on the model page. Omit or leave null when default ToU applies."
        },
        "upstream_sources": {
            "type": "array",
            "description": "Informational provenance: upstream datasets, licenses, citations. Displayed as a Sources block on the model page. Does not itself trigger consent.",
            "items": { "$ref": "#/$defs/upstreamSource" }
        }
    },
    "$defs": {
        "upstreamSource": {
            "type": "object",
            "required": ["name"],
            "additionalProperties": false,
            "properties": {
                "name": {
                    "type": "string",
                    "description": "Human-readable dataset / source name."
                },
                "license": {
                    "type": "string",
                    "description": "Upstream license identifier (e.g., 'CC-BY-4.0', 'CC-BY-NC-4.0', 'proprietary')."
                },
                "url": {
                    "type": "string",
                    "description": "URL to the source's homepage or landing page."
                },
                "citation": {
                    "type": "string",
                    "description": "Attribution / citation text as required by the upstream license."
                },
                "notes": {
                    "type": "string",
                    "description": "Any extra context (e.g., access conditions, version, date retrieved)."
                }
            }
        },
        "trait": {
            "type": "object",
            "additionalProperties": false,
            "required": ["type", "key"],
            "properties": {
                "type": {
                    "type": "string",
                    "enum": ["spectrum", "topic", "claim", "outlier"],
                    "description": "Trait geometry type. See DR 028 for the taxonomy and (geometry, range, metric) triples."
                },
                "key": {
                    "type": "string",
                    "pattern": "^[a-z0-9][a-z0-9_-]*$",
                    "maxLength": 100,
                    "description": "Stable identifier for this trait. Used everywhere: DB, mappings, API responses."
                },
                "name": {
                    "type": "string",
                    "minLength": 1,
                    "description": "Human-readable display name. Defaults to titlecased key."
                },
                "description": { "type": "string", "default": "" },
                "positive_label": { "type": "string", "default": "positive" },
                "negative_label": { "type": "string", "default": "negative" },
                "aggregation": {
                    "type": "string",
                    "enum": ["mean", "max", "min", "none"],
                    "default": "mean"
                },
                "positive_paraphrases": {
                    "type": "array",
                    "items": { "type": "string", "minLength": 1 },
                    "description": "Authored paraphrases of the positive pole. Used by the axis enricher (DR 028, DR 030) to synthesize a sample source when no dataset is provided."
                },
                "negative_paraphrases": {
                    "type": "array",
                    "items": { "type": "string", "minLength": 1 },
                    "description": "Authored paraphrases of the negative pole. Companion to positive_paraphrases."
                },
                "topic_paraphrases": {
                    "type": "array",
                    "items": { "type": "string", "minLength": 1 },
                    "description": "Authored content samples that exemplify the topic. Used by the topic enricher (DR 028) to materialize a manifold cluster; the ambient pole is auto-attached as a noise source."
                },
                "claim_statement": {
                    "type": "string",
                    "description": "The claim under test (e.g., 'All swans are white.'). Authoring context for claim-type traits; downstream synthesis uses it when confirming/contradicting paraphrases are missing."
                },
                "confirming_paraphrases": {
                    "type": "array",
                    "items": { "type": "string", "minLength": 1 },
                    "description": "Authored content samples that affirm the claim. Phase 3a treats these as positive samples for axis math; Phase 3b future work uses them in the dual-projection geometry from DR 028."
                },
                "contradicting_paraphrases": {
                    "type": "array",
                    "items": { "type": "string", "minLength": 1 },
                    "description": "Authored content samples that contradict the claim. Companion to confirming_paraphrases."
                },
                "cohort_paraphrases": {
                    "type": "array",
                    "items": { "type": "string", "minLength": 1 },
                    "description": "Authored representative samples of the typical/in-cohort distribution. Used by the outlier enricher (DR 028) to materialize an inverted axis where the cohort is the negative pole and ambient noise is the positive (outlier-like) pole. Phase 4a; Phase 4b upgrades to true Mahalanobis distance."
                }
            }
        },
        "discoverEntry": {
            "type": "object",
            "required": ["discover"],
            "additionalProperties": false,
            "properties": {
                "discover": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "Number of additional traits to discover from data."
                }
            }
        },
        "qualityValue": {
            "description": "Quality indicator — a quality label string or numeric value in [0, 1].",
            "oneOf": [
                {
                    "type": "string",
                    "enum": [
                        "positive",
                        "negative",
                        "good",
                        "yes",
                        "high",
                        "strong",
                        "excellent",
                        "fair",
                        "moderate",
                        "mixed",
                        "medium",
                        "poor",
                        "no",
                        "low",
                        "weak",
                        "bad"
                    ]
                },
                {
                    "type": "number",
                    "minimum": 0,
                    "maximum": 1
                }
            ]
        },
        "jsonlMapping": {
            "type": "object",
            "description": "Maps JSONL field names to traits. Keys are field names in the JSONL file.",
            "minProperties": 1,
            "additionalProperties": {
                "type": "object",
                "required": ["trait"],
                "additionalProperties": false,
                "properties": {
                    "trait": {
                        "type": "string",
                        "description": "Name of the trait this field maps to. Must match a trait defined in the traits section."
                    }
                }
            }
        },
        "mappingValue": {
            "description": "Label mapping value — a quality string, or {trait, quality} object.",
            "oneOf": [
                { "$ref": "#/$defs/qualityValue" },
                {
                    "type": "object",
                    "required": ["trait"],
                    "additionalProperties": false,
                    "properties": {
                        "trait": { "type": "string" },
                        "quality": { "$ref": "#/$defs/qualityValue" }
                    }
                }
            ]
        },
        "source": {
            "type": "object",
            "required": ["type"],
            "properties": {
                "type": {
                    "type": "string",
                    "enum": [
                        "jsonl",
                        "csv",
                        "github_csv",
                        "url",
                        "file",
                        "huggingface",
                        "noise"
                    ]
                }
            },
            "allOf": [
                {
                    "if": {
                        "properties": { "type": { "const": "jsonl" } },
                        "required": ["type"]
                    },
                    "then": {
                        "required": ["type", "path", "mapping"],
                        "additionalProperties": false,
                        "properties": {
                            "type": { "const": "jsonl" },
                            "path": {
                                "type": "string",
                                "description": "Path to JSONL file, relative to model directory."
                            },
                            "mapping": { "$ref": "#/$defs/jsonlMapping" },
                            "dropout": { "type": "boolean", "default": false }
                        }
                    }
                },
                {
                    "if": {
                        "properties": { "type": { "const": "csv" } },
                        "required": ["type"]
                    },
                    "then": {
                        "required": ["type", "path"],
                        "additionalProperties": false,
                        "properties": {
                            "type": { "const": "csv" },
                            "path": { "type": "string" },
                            "trait": { "type": "string" },
                            "quality": { "$ref": "#/$defs/qualityValue" },
                            "text_column": { "type": "string" },
                            "label_column": { "type": "string" },
                            "mapping": {
                                "type": "object",
                                "additionalProperties": {
                                    "$ref": "#/$defs/mappingValue"
                                }
                            },
                            "columns": {
                                "type": "object",
                                "description": "Maps CSV column names to traits. Sources emit raw scores; the pipeline handles normalization.",
                                "minProperties": 1,
                                "additionalProperties": {
                                    "type": "object",
                                    "required": ["trait"],
                                    "additionalProperties": false,
                                    "properties": {
                                        "trait": {
                                            "type": "string",
                                            "description": "Trait key this column maps to."
                                        }
                                    }
                                }
                            },
                            "max_per_class": {
                                "type": "integer",
                                "minimum": 1,
                                "description": "Cap samples per trait per quality class."
                            },
                            "seed": {
                                "type": "integer",
                                "description": "Deterministic sampling seed."
                            },
                            "dropout": { "type": "boolean", "default": false }
                        }
                    }
                },
                {
                    "if": {
                        "properties": { "type": { "const": "github_csv" } },
                        "required": ["type"]
                    },
                    "then": {
                        "required": ["type", "repo", "path", "columns"],
                        "additionalProperties": false,
                        "properties": {
                            "type": { "const": "github_csv" },
                            "repo": {
                                "type": "string",
                                "description": "GitHub repository (owner/name)."
                            },
                            "ref": {
                                "type": "string",
                                "description": "Git ref: branch, tag, or commit SHA. Defaults to main."
                            },
                            "path": {
                                "type": "string",
                                "description": "Path to CSV file within the repository."
                            },
                            "text_column": {
                                "type": "string",
                                "default": "text"
                            },
                            "max_samples": { "type": "integer", "minimum": 1 },
                            "columns": {
                                "type": "object",
                                "description": "Maps CSV column names to traits. Sources emit raw scores; the pipeline handles normalization.",
                                "minProperties": 1,
                                "additionalProperties": {
                                    "type": "object",
                                    "required": ["trait"],
                                    "additionalProperties": false,
                                    "properties": {
                                        "trait": {
                                            "type": "string",
                                            "description": "Trait key this column maps to."
                                        }
                                    }
                                }
                            },
                            "max_per_class": {
                                "type": "integer",
                                "minimum": 1,
                                "description": "Cap samples per trait per quality class."
                            },
                            "seed": {
                                "type": "integer",
                                "description": "Deterministic sampling seed."
                            },
                            "dropout": { "type": "boolean", "default": false }
                        }
                    }
                },
                {
                    "if": {
                        "properties": { "type": { "const": "huggingface" } },
                        "required": ["type"]
                    },
                    "then": {
                        "required": ["type", "dataset", "mapping"],
                        "additionalProperties": false,
                        "properties": {
                            "type": { "const": "huggingface" },
                            "dataset": { "type": "string" },
                            "split": { "type": "string", "default": "train" },
                            "eval_split": { "type": "string" },
                            "max_samples": { "type": "integer", "minimum": 1 },
                            "text_column": {
                                "type": "string",
                                "default": "text"
                            },
                            "label_column": {
                                "type": "string",
                                "default": "label"
                            },
                            "hf_config": { "type": "string" },
                            "mapping": {
                                "type": "object",
                                "additionalProperties": {
                                    "$ref": "#/$defs/mappingValue"
                                }
                            },
                            "dropout": { "type": "boolean", "default": false }
                        }
                    }
                },
                {
                    "if": {
                        "properties": { "type": { "const": "url" } },
                        "required": ["type"]
                    },
                    "then": {
                        "required": ["type", "url"],
                        "additionalProperties": false,
                        "properties": {
                            "type": { "const": "url" },
                            "url": { "type": "string", "format": "uri" },
                            "trait": { "type": "string" },
                            "quality": { "$ref": "#/$defs/qualityValue" },
                            "dropout": { "type": "boolean", "default": false }
                        }
                    }
                },
                {
                    "if": {
                        "properties": { "type": { "const": "file" } },
                        "required": ["type"]
                    },
                    "then": {
                        "required": ["type", "path"],
                        "additionalProperties": false,
                        "properties": {
                            "type": { "const": "file" },
                            "path": { "type": "string" },
                            "trait": { "type": "string" },
                            "quality": { "$ref": "#/$defs/qualityValue" },
                            "dropout": { "type": "boolean", "default": false }
                        }
                    }
                },
                {
                    "if": {
                        "properties": { "type": { "const": "noise" } },
                        "required": ["type"]
                    },
                    "then": {
                        "required": ["type"],
                        "additionalProperties": false,
                        "properties": {
                            "type": { "const": "noise" },
                            "dataset": { "type": "string" },
                            "split": { "type": "string" },
                            "max_samples": { "type": "integer", "minimum": 1 },
                            "text_column": { "type": "string" },
                            "hf_config": { "type": "string" },
                            "traits": {
                                "type": ["array", "null"],
                                "items": { "type": "string" },
                                "description": "Which traits to target. Omit or null for all traits."
                            },
                            "quality": {
                                "$ref": "#/$defs/qualityValue",
                                "description": "Polarity of emitted noise samples. Defaults to negative (universal negatives, the original use case). The outlier enricher (DR 028) sets this to 'positive' so noise occupies the outlier-like pole."
                            },
                            "dropout": { "type": "boolean", "default": true }
                        }
                    }
                }
            ]
        },
        "training": {
            "type": "object",
            "additionalProperties": false,
            "properties": {
                "scoring_method": {
                    "type": "string",
                    "enum": [
                        "auto",
                        "centered_cosine",
                        "balanced_cosine",
                        "balanced_linear",
                        "percentile",
                        "weighted_cosine",
                        "weighted_linear",
                        "weighted_fisher",
                        "regression",
                        "ridge"
                    ],
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "test_split": {
                    "type": "number",
                    "minimum": 0.01,
                    "maximum": 0.99,
                    "description": "[deprecated] Auto-determined from data size by the training module. Setting this overrides the auto value."
                },
                "auto_prune": {
                    "type": "boolean",
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "prune_z_threshold": {
                    "type": "number",
                    "minimum": 0,
                    "description": "[deprecated] Auto-determined from data distribution. Setting this overrides the auto value."
                },
                "cold_start_k": {
                    "type": "array",
                    "items": { "type": "integer", "minimum": 1 },
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "stability_bootstrap_n": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "trials_per_method": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "[deprecated] Use top-level `effort` instead. Optuna trial budget per scoring method; setting this overrides the effort-derived default."
                },
                "early_stopping_patience": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "[deprecated] Use top-level `effort` instead. Setting this overrides the effort-derived default."
                },
                "compute_diagnostics": {
                    "type": "boolean",
                    "description": "[deprecated] Use top-level `effort` instead. Whether to run cold-start / stability / correlation diagnostics during training. Setting this overrides the effort-derived default."
                },
                "max_parallel_methods": {
                    "type": "integer",
                    "minimum": 1,
                    "default": 4,
                    "description": "Max concurrent method studies during optimization. Host concurrency, not effort-controlled."
                },
                "boundary_low": {
                    "type": "number",
                    "minimum": 0,
                    "maximum": 50,
                    "description": "Fixed low percentile boundary. Skips boundary sweep when set with boundary_high."
                },
                "boundary_high": {
                    "type": "number",
                    "minimum": 50,
                    "maximum": 100,
                    "description": "Fixed high percentile boundary. Skips boundary sweep when set with boundary_low."
                },
                "boundary_mode": {
                    "type": "string",
                    "enum": ["exclude", "interpolate"],
                    "default": "exclude",
                    "description": "How to handle samples between boundaries. Only used when boundary_low/high are set."
                }
            }
        },
        "discovery": {
            "type": "object",
            "additionalProperties": false,
            "properties": {
                "method": {
                    "type": "string",
                    "enum": ["auto", "pca", "lda", "clustering"],
                    "default": "auto",
                    "description": "Author override of the auto method selection."
                },
                "min_discrimination": {
                    "oneOf": [
                        { "const": "auto" },
                        { "type": "number", "minimum": 0, "maximum": 1 }
                    ],
                    "default": "auto",
                    "description": "Author override of the auto threshold sweep."
                },
                "min_similarity": {
                    "type": "number",
                    "minimum": 0,
                    "maximum": 1,
                    "description": "Author override of the auto-determined similarity floor for interpretation samples. Samples below this floor are filtered out before reaching the LLM."
                },
                "interpret_context": {
                    "type": "string",
                    "description": "Optional scenario-specific framing piped into the interpretation prompt."
                },
                "max_traits": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "optimize": {
                    "type": "string",
                    "enum": ["quick", "full"],
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "full_mode_top_n": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "interpret": {
                    "type": "boolean",
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "interpret_top_k": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                },
                "interpret_max_chars": {
                    "type": "integer",
                    "minimum": 1,
                    "description": "[deprecated] Use top-level `effort` instead. When set, overrides the effort-derived default."
                }
            }
        }
    }
}