From 8540cdba92d8581c2eabd6507021663360726ca3 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Thu, 25 Jun 2026 20:58:07 -0700
Subject: [PATCH 01/10] feat(web): add language model inputModalities
 capability plumbing

Add an optional `inputModalities` declaration to language model config and
expose a resolved capability set to the client.

- Schema: add an optional `inputModalities` array (items: `text` | `image` |
  `pdf`) to every provider definition in `schemas/v3/languageModel.json` and
  regenerate the schema types/snippets.
- Add a fail-closed `resolveModelInputModalities` resolver that defaults to
  text-only when a model does not declare its input modalities.
- Expose the resolved `inputModalities` on the client-safe `LanguageModelInfo`
  (populated via `getConfiguredLanguageModelsInfo` and the MCP ask path).

This is groundwork for chat file attachments. It adds no attachment UI and no
live provider capability probing yet.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 docs/snippets/schemas/v3/index.schema.mdx     | 264 ++++++++++++++++++
 .../schemas/v3/languageModel.schema.mdx       | 264 ++++++++++++++++++
 packages/schemas/src/v3/index.schema.ts       | 264 ++++++++++++++++++
 packages/schemas/src/v3/index.type.ts         |  48 ++++
 .../schemas/src/v3/languageModel.schema.ts    | 264 ++++++++++++++++++
 packages/schemas/src/v3/languageModel.type.ts |  48 ++++
 .../web/src/ee/features/mcp/askCodebase.ts    |   2 +
 .../src/features/chat/modelCapabilities.ts    |  13 +
 packages/web/src/features/chat/types.ts       |   4 +
 .../web/src/features/chat/utils.server.ts     |   2 +
 schemas/v3/languageModel.json                 | 134 ++++++++-
 11 files changed, 1306 insertions(+), 1 deletion(-)
 create mode 100644 packages/web/src/features/chat/modelCapabilities.ts

diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx
index 864359251..e0b00c540 100644
--- a/docs/snippets/schemas/v3/index.schema.mdx
+++ b/docs/snippets/schemas/v3/index.schema.mdx
@@ -1860,6 +1860,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -1998,6 +2009,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2133,6 +2155,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2240,6 +2273,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2361,6 +2405,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2484,6 +2539,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2623,6 +2689,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2730,6 +2807,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2863,6 +2951,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3027,6 +3126,17 @@
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3135,6 +3245,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3246,6 +3367,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3426,6 +3558,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3564,6 +3707,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3699,6 +3853,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3806,6 +3971,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3927,6 +4103,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4050,6 +4237,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4189,6 +4387,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4296,6 +4505,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4429,6 +4649,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4593,6 +4824,17 @@
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4701,6 +4943,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4812,6 +5065,17 @@
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
diff --git a/docs/snippets/schemas/v3/languageModel.schema.mdx b/docs/snippets/schemas/v3/languageModel.schema.mdx
index 90aee08af..7c7874207 100644
--- a/docs/snippets/schemas/v3/languageModel.schema.mdx
+++ b/docs/snippets/schemas/v3/languageModel.schema.mdx
@@ -174,6 +174,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -312,6 +323,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -447,6 +469,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -554,6 +587,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -675,6 +719,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -798,6 +853,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -937,6 +1003,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1044,6 +1121,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1177,6 +1265,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1341,6 +1440,17 @@
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1449,6 +1559,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1560,6 +1681,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1740,6 +1872,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1878,6 +2021,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2013,6 +2167,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2120,6 +2285,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2241,6 +2417,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2364,6 +2551,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2503,6 +2701,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2610,6 +2819,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2743,6 +2963,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2907,6 +3138,17 @@
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -3015,6 +3257,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -3126,6 +3379,17 @@
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts
index 8c1d64b52..257c8ae7d 100644
--- a/packages/schemas/src/v3/index.schema.ts
+++ b/packages/schemas/src/v3/index.schema.ts
@@ -1859,6 +1859,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -1997,6 +2008,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2132,6 +2154,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2239,6 +2272,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2360,6 +2404,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2483,6 +2538,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2622,6 +2688,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2729,6 +2806,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -2862,6 +2950,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3026,6 +3125,17 @@ const schema = {
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3134,6 +3244,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3245,6 +3366,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3425,6 +3557,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3563,6 +3706,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3698,6 +3852,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3805,6 +3970,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -3926,6 +4102,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4049,6 +4236,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4188,6 +4386,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4295,6 +4504,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4428,6 +4648,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4592,6 +4823,17 @@ const schema = {
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4700,6 +4942,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
@@ -4811,6 +5064,17 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
+              },
+              "inputModalities": {
+                "type": "array",
+                "items": {
+                  "enum": [
+                    "text",
+                    "image",
+                    "pdf"
+                  ]
+                },
+                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
               }
             },
             "required": [
diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts
index 7fa7f5a17..85dbaac43 100644
--- a/packages/schemas/src/v3/index.type.ts
+++ b/packages/schemas/src/v3/index.type.ts
@@ -762,6 +762,10 @@ export interface AmazonBedrockLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 /**
  * Optional headers to use with the model.
@@ -842,6 +846,10 @@ export interface AnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface AzureLanguageModel {
   /**
@@ -897,6 +905,10 @@ export interface AzureLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -936,6 +948,10 @@ export interface DeepSeekLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -983,6 +999,10 @@ export interface GoogleGenerativeAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -1030,6 +1050,10 @@ export interface GoogleVertexAnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -1085,6 +1109,10 @@ export interface GoogleVertexLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface MistralLanguageModel {
   /**
@@ -1124,6 +1152,10 @@ export interface MistralLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -1171,6 +1203,10 @@ export interface OpenAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -1215,6 +1251,10 @@ export interface OpenAICompatibleLanguageModel {
    * Optional temperature setting to use with the model.
    */
   temperature?: number;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -1279,6 +1319,10 @@ export interface OpenRouterLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface XaiLanguageModel {
   /**
@@ -1318,6 +1362,10 @@ export interface XaiLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GitHubAppConfig {
   /**
diff --git a/packages/schemas/src/v3/languageModel.schema.ts b/packages/schemas/src/v3/languageModel.schema.ts
index ab418ce79..85c2bf8a8 100644
--- a/packages/schemas/src/v3/languageModel.schema.ts
+++ b/packages/schemas/src/v3/languageModel.schema.ts
@@ -173,6 +173,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -311,6 +322,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -446,6 +468,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -553,6 +586,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -674,6 +718,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -797,6 +852,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -936,6 +1002,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1043,6 +1120,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1176,6 +1264,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1340,6 +1439,17 @@ const schema = {
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1448,6 +1558,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1559,6 +1680,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1739,6 +1871,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -1877,6 +2020,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2012,6 +2166,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2119,6 +2284,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2240,6 +2416,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2363,6 +2550,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2502,6 +2700,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2609,6 +2818,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2742,6 +2962,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -2906,6 +3137,17 @@ const schema = {
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -3014,6 +3256,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
@@ -3125,6 +3378,17 @@ const schema = {
             }
           },
           "additionalProperties": false
+        },
+        "inputModalities": {
+          "type": "array",
+          "items": {
+            "enum": [
+              "text",
+              "image",
+              "pdf"
+            ]
+          },
+          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/languageModel.type.ts b/packages/schemas/src/v3/languageModel.type.ts
index 5c3b25668..df4569ee8 100644
--- a/packages/schemas/src/v3/languageModel.type.ts
+++ b/packages/schemas/src/v3/languageModel.type.ts
@@ -88,6 +88,10 @@ export interface AmazonBedrockLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 /**
  * Optional headers to use with the model.
@@ -168,6 +172,10 @@ export interface AnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface AzureLanguageModel {
   /**
@@ -223,6 +231,10 @@ export interface AzureLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -262,6 +274,10 @@ export interface DeepSeekLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -309,6 +325,10 @@ export interface GoogleGenerativeAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -356,6 +376,10 @@ export interface GoogleVertexAnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -411,6 +435,10 @@ export interface GoogleVertexLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface MistralLanguageModel {
   /**
@@ -450,6 +478,10 @@ export interface MistralLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -497,6 +529,10 @@ export interface OpenAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -541,6 +577,10 @@ export interface OpenAICompatibleLanguageModel {
    * Optional temperature setting to use with the model.
    */
   temperature?: number;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -605,6 +645,10 @@ export interface OpenRouterLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
 export interface XaiLanguageModel {
   /**
@@ -644,4 +688,8 @@ export interface XaiLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
+  /**
+   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "pdf")[];
 }
diff --git a/packages/web/src/ee/features/mcp/askCodebase.ts b/packages/web/src/ee/features/mcp/askCodebase.ts
index 4b7cfb7b0..8b2432fb5 100644
--- a/packages/web/src/ee/features/mcp/askCodebase.ts
+++ b/packages/web/src/ee/features/mcp/askCodebase.ts
@@ -4,6 +4,7 @@ import { generateChatNameFromMessage } from "@/ee/features/chat/llm.server";
 import { getAISDKLanguageModelAndOptions } from "@/features/chat/llm.server";
 import { LanguageModelInfo, SBChatMessage, SearchScope } from "@/features/chat/types";
 import { convertLLMOutputToPortableMarkdown, getAnswerPartFromAssistantMessage, getLanguageModelKey } from "@/features/chat/utils";
+import { resolveModelInputModalities } from "@/features/chat/modelCapabilities";
 import { ErrorCode } from "@/lib/errorCodes";
 import { ServiceError, ServiceErrorException } from "@/lib/serviceError";
 import { withOptionalAuth } from "@/middleware/withAuth";
@@ -243,6 +244,7 @@ export const askCodebase = (params: AskCodebaseParams): Promise<AskCodebaseResul
                     provider: languageModelConfig.provider,
                     model: languageModelConfig.model,
                     displayName: languageModelConfig.displayName,
+                    inputModalities: resolveModelInputModalities(languageModelConfig),
                 },
             } satisfies AskCodebaseResult;
         })
diff --git a/packages/web/src/features/chat/modelCapabilities.ts b/packages/web/src/features/chat/modelCapabilities.ts
new file mode 100644
index 000000000..4dbe9bcd6
--- /dev/null
+++ b/packages/web/src/features/chat/modelCapabilities.ts
@@ -0,0 +1,13 @@
+import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
+import { InputModality } from './types';
+
+// Fail-closed: when a model does not declare input modalities, assume text-only.
+// NOTE: future work may add live provider capability probing (see
+// tryResolveAnthropicThinkingConfig in llm.server.ts for the precedent).
+export const resolveModelInputModalities = (config: LanguageModel): InputModality[] => {
+    const declared = config.inputModalities;
+    if (declared && declared.length > 0) {
+        return declared;
+    }
+    return ['text'];
+}
diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index 38a737a09..615fe2b1c 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -208,10 +208,13 @@ type _AssertAllProviders = LanguageModelProvider extends typeof languageModelPro
 const _assertAllProviders: _AssertAllProviders = true;
 void _assertAllProviders;
 
+export type InputModality = 'text' | 'image' | 'pdf';
+
 export const languageModelInfoSchema = z.object({
     provider: z.enum(languageModelProviders).describe("The model provider (e.g., 'anthropic', 'openai')"),
     model: z.string().describe("The model ID"),
     displayName: z.string().optional().describe("Optional display name for the model"),
+    inputModalities: z.array(z.enum(['text', 'image', 'pdf'])).default(['text']).describe("The input modalities the model can accept. Defaults to text-only."),
 });
 
 /**
@@ -221,6 +224,7 @@ export type LanguageModelInfo = {
     provider: LanguageModelProvider,
     model: LanguageModel['model'],
     displayName?: LanguageModel['displayName'],
+    inputModalities: InputModality[],
 }
 
 // Additional request body data that we send along to the chat API.
diff --git a/packages/web/src/features/chat/utils.server.ts b/packages/web/src/features/chat/utils.server.ts
index ffc3483a4..7ec47b677 100644
--- a/packages/web/src/features/chat/utils.server.ts
+++ b/packages/web/src/features/chat/utils.server.ts
@@ -7,6 +7,7 @@ import { env, loadConfig } from '@sourcebot/shared';
 import fs from 'fs';
 import path from 'path';
 import { LanguageModelInfo, SBChatMessage } from './types';
+import { resolveModelInputModalities } from './modelCapabilities';
 import { hasEntitlement } from '@/lib/entitlements';
 import { ServiceError } from '@/lib/serviceError';
 import { ErrorCode } from '@/lib/errorCodes';
@@ -131,5 +132,6 @@ export const getConfiguredLanguageModelsInfo = async () => {
         provider: model.provider,
         model: model.model,
         displayName: model.displayName,
+        inputModalities: resolveModelInputModalities(model),
     }));
 };
diff --git a/schemas/v3/languageModel.json b/schemas/v3/languageModel.json
index 3f1d13d52..0fb96217a 100644
--- a/schemas/v3/languageModel.json
+++ b/schemas/v3/languageModel.json
@@ -50,6 +50,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -93,6 +104,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -160,6 +182,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -199,6 +232,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -252,6 +296,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -307,6 +362,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -378,6 +444,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -417,6 +494,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -482,6 +570,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -537,6 +636,17 @@
                 "temperature": {
                     "type": "number",
                     "description": "Optional temperature setting to use with the model."
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -577,6 +687,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -620,6 +741,17 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
+                },
+                "inputModalities": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
+                            "text",
+                            "image",
+                            "pdf"
+                        ]
+                    },
+                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
                 }
             },
             "required": [
@@ -667,4 +799,4 @@
             "$ref": "#/definitions/XaiLanguageModel"
         }
     ]
-}
\ No newline at end of file
+}

From a473b49cd8de430e00183305a563f34dba39c113 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Thu, 25 Jun 2026 20:58:48 -0700
Subject: [PATCH 02/10] docs: add CHANGELOG entry for language model
 inputModalities

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 689718d36..5163f833a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - Added per-step token cost tracking and estimated tool call token usage to Ask Sourcebot chat history. [#1353](https://github.com/sourcebot-dev/sourcebot/pull/1353)
+- Added optional `inputModalities` configuration for language models, exposing model input-modality capabilities (defaults to text-only). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
 
 ### Fixed
 - Send anonymous server-side PostHog events as personless so unauthenticated requests don't inflate person counts. [#1367](https://github.com/sourcebot-dev/sourcebot/pull/1367)

From 4b57d279bea951a86e806b46e555a30eebe615dc Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 10:08:53 -0700
Subject: [PATCH 03/10] refactor(schemas): split document types out of
 inputModalities

`inputModalities` now enumerates only true perceptual channels
(text | image | audio | video). Document/container formats such as PDF
move to a separate, fail-closed `supportedDocumentTypes` field, because
PDF is not a model modality but a container format that providers
decompose into text and images internally.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 CHANGELOG.md                                  |   2 +-
 docs/snippets/schemas/v3/index.schema.mdx     | 288 ++++++++++++++++--
 .../schemas/v3/languageModel.schema.mdx       | 288 ++++++++++++++++--
 packages/schemas/src/v3/index.schema.ts       | 288 ++++++++++++++++--
 packages/schemas/src/v3/index.type.ts         |  96 ++++--
 .../schemas/src/v3/languageModel.schema.ts    | 288 ++++++++++++++++--
 packages/schemas/src/v3/languageModel.type.ts |  96 ++++--
 .../web/src/ee/features/mcp/askCodebase.ts    |   3 +-
 .../src/features/chat/modelCapabilities.ts    |  13 +-
 packages/web/src/features/chat/types.ts       |   7 +-
 .../web/src/features/chat/utils.server.ts     |   3 +-
 schemas/v3/languageModel.json                 | 144 ++++++++-
 12 files changed, 1354 insertions(+), 162 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5163f833a..caa90e9b1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 - Added per-step token cost tracking and estimated tool call token usage to Ask Sourcebot chat history. [#1353](https://github.com/sourcebot-dev/sourcebot/pull/1353)
-- Added optional `inputModalities` configuration for language models, exposing model input-modality capabilities (defaults to text-only). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
+- Added optional `inputModalities` and `supportedDocumentTypes` configuration for language models, exposing model input-modality and document capabilities (defaults to text-only, no documents). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
 
 ### Fixed
 - Send anonymous server-side PostHog events as personless so unauthenticated requests don't inflate person counts. [#1367](https://github.com/sourcebot-dev/sourcebot/pull/1367)
diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx
index e0b00c540..5b099d724 100644
--- a/docs/snippets/schemas/v3/index.schema.mdx
+++ b/docs/snippets/schemas/v3/index.schema.mdx
@@ -1867,10 +1867,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2016,10 +2026,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2162,10 +2182,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2280,10 +2310,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2412,10 +2452,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2546,10 +2596,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2696,10 +2756,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2814,10 +2884,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2958,10 +3038,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3133,10 +3223,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3252,10 +3352,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3374,10 +3484,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3565,10 +3685,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3714,10 +3844,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3860,10 +4000,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3978,10 +4128,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4110,10 +4270,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4244,10 +4414,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4394,10 +4574,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4512,10 +4702,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4656,10 +4856,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4831,10 +5041,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4950,10 +5170,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -5072,10 +5302,20 @@
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
diff --git a/docs/snippets/schemas/v3/languageModel.schema.mdx b/docs/snippets/schemas/v3/languageModel.schema.mdx
index 7c7874207..7b1e774cf 100644
--- a/docs/snippets/schemas/v3/languageModel.schema.mdx
+++ b/docs/snippets/schemas/v3/languageModel.schema.mdx
@@ -181,10 +181,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -330,10 +340,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -476,10 +496,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -594,10 +624,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -726,10 +766,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -860,10 +910,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1010,10 +1070,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1128,10 +1198,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1272,10 +1352,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1447,10 +1537,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1566,10 +1666,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1688,10 +1798,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1879,10 +1999,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2028,10 +2158,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2174,10 +2314,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2292,10 +2442,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2424,10 +2584,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2558,10 +2728,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2708,10 +2888,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2826,10 +3016,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2970,10 +3170,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3145,10 +3355,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3264,10 +3484,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3386,10 +3616,20 @@
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts
index 257c8ae7d..7d051544c 100644
--- a/packages/schemas/src/v3/index.schema.ts
+++ b/packages/schemas/src/v3/index.schema.ts
@@ -1866,10 +1866,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2015,10 +2025,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2161,10 +2181,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2279,10 +2309,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2411,10 +2451,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2545,10 +2595,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2695,10 +2755,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2813,10 +2883,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -2957,10 +3037,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3132,10 +3222,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3251,10 +3351,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3373,10 +3483,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3564,10 +3684,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3713,10 +3843,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3859,10 +3999,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -3977,10 +4127,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4109,10 +4269,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4243,10 +4413,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4393,10 +4573,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4511,10 +4701,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4655,10 +4855,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4830,10 +5040,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -4949,10 +5169,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
@@ -5071,10 +5301,20 @@ const schema = {
                   "enum": [
                     "text",
                     "image",
+                    "audio",
+                    "video"
+                  ]
+                },
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+              },
+              "supportedDocumentTypes": {
+                "type": "array",
+                "items": {
+                  "enum": [
                     "pdf"
                   ]
                 },
-                "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
               }
             },
             "required": [
diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts
index 85dbaac43..14c8c14e2 100644
--- a/packages/schemas/src/v3/index.type.ts
+++ b/packages/schemas/src/v3/index.type.ts
@@ -763,9 +763,13 @@ export interface AmazonBedrockLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional headers to use with the model.
@@ -847,9 +851,13 @@ export interface AnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface AzureLanguageModel {
   /**
@@ -906,9 +914,13 @@ export interface AzureLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -949,9 +961,13 @@ export interface DeepSeekLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -1000,9 +1016,13 @@ export interface GoogleGenerativeAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -1051,9 +1071,13 @@ export interface GoogleVertexAnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -1110,9 +1134,13 @@ export interface GoogleVertexLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface MistralLanguageModel {
   /**
@@ -1153,9 +1181,13 @@ export interface MistralLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -1204,9 +1236,13 @@ export interface OpenAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -1252,9 +1288,13 @@ export interface OpenAICompatibleLanguageModel {
    */
   temperature?: number;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -1320,9 +1360,13 @@ export interface OpenRouterLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface XaiLanguageModel {
   /**
@@ -1363,9 +1407,13 @@ export interface XaiLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GitHubAppConfig {
   /**
diff --git a/packages/schemas/src/v3/languageModel.schema.ts b/packages/schemas/src/v3/languageModel.schema.ts
index 85c2bf8a8..9c9ae7b2d 100644
--- a/packages/schemas/src/v3/languageModel.schema.ts
+++ b/packages/schemas/src/v3/languageModel.schema.ts
@@ -180,10 +180,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -329,10 +339,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -475,10 +495,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -593,10 +623,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -725,10 +765,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -859,10 +909,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1009,10 +1069,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1127,10 +1197,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1271,10 +1351,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1446,10 +1536,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1565,10 +1665,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1687,10 +1797,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -1878,10 +1998,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2027,10 +2157,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2173,10 +2313,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2291,10 +2441,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2423,10 +2583,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2557,10 +2727,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2707,10 +2887,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2825,10 +3015,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -2969,10 +3169,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3144,10 +3354,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3263,10 +3483,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
@@ -3385,10 +3615,20 @@ const schema = {
             "enum": [
               "text",
               "image",
+              "audio",
+              "video"
+            ]
+          },
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+        },
+        "supportedDocumentTypes": {
+          "type": "array",
+          "items": {
+            "enum": [
               "pdf"
             ]
           },
-          "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/languageModel.type.ts b/packages/schemas/src/v3/languageModel.type.ts
index df4569ee8..3297689b7 100644
--- a/packages/schemas/src/v3/languageModel.type.ts
+++ b/packages/schemas/src/v3/languageModel.type.ts
@@ -89,9 +89,13 @@ export interface AmazonBedrockLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional headers to use with the model.
@@ -173,9 +177,13 @@ export interface AnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface AzureLanguageModel {
   /**
@@ -232,9 +240,13 @@ export interface AzureLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -275,9 +287,13 @@ export interface DeepSeekLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -326,9 +342,13 @@ export interface GoogleGenerativeAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -377,9 +397,13 @@ export interface GoogleVertexAnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -436,9 +460,13 @@ export interface GoogleVertexLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface MistralLanguageModel {
   /**
@@ -479,9 +507,13 @@ export interface MistralLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -530,9 +562,13 @@ export interface OpenAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -578,9 +614,13 @@ export interface OpenAICompatibleLanguageModel {
    */
   temperature?: number;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   */
+  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -646,9 +686,13 @@ export interface OpenRouterLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
 export interface XaiLanguageModel {
   /**
@@ -689,7 +733,11 @@ export interface XaiLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   */
+  inputModalities?: ("text" | "image" | "audio" | "video")[];
+  /**
+   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
    */
-  inputModalities?: ("text" | "image" | "pdf")[];
+  supportedDocumentTypes?: "pdf"[];
 }
diff --git a/packages/web/src/ee/features/mcp/askCodebase.ts b/packages/web/src/ee/features/mcp/askCodebase.ts
index 8b2432fb5..7f779ffc8 100644
--- a/packages/web/src/ee/features/mcp/askCodebase.ts
+++ b/packages/web/src/ee/features/mcp/askCodebase.ts
@@ -4,7 +4,7 @@ import { generateChatNameFromMessage } from "@/ee/features/chat/llm.server";
 import { getAISDKLanguageModelAndOptions } from "@/features/chat/llm.server";
 import { LanguageModelInfo, SBChatMessage, SearchScope } from "@/features/chat/types";
 import { convertLLMOutputToPortableMarkdown, getAnswerPartFromAssistantMessage, getLanguageModelKey } from "@/features/chat/utils";
-import { resolveModelInputModalities } from "@/features/chat/modelCapabilities";
+import { resolveModelInputModalities, resolveModelSupportedDocumentTypes } from "@/features/chat/modelCapabilities";
 import { ErrorCode } from "@/lib/errorCodes";
 import { ServiceError, ServiceErrorException } from "@/lib/serviceError";
 import { withOptionalAuth } from "@/middleware/withAuth";
@@ -245,6 +245,7 @@ export const askCodebase = (params: AskCodebaseParams): Promise<AskCodebaseResul
                     model: languageModelConfig.model,
                     displayName: languageModelConfig.displayName,
                     inputModalities: resolveModelInputModalities(languageModelConfig),
+                    supportedDocumentTypes: resolveModelSupportedDocumentTypes(languageModelConfig),
                 },
             } satisfies AskCodebaseResult;
         })
diff --git a/packages/web/src/features/chat/modelCapabilities.ts b/packages/web/src/features/chat/modelCapabilities.ts
index 4dbe9bcd6..8b976af59 100644
--- a/packages/web/src/features/chat/modelCapabilities.ts
+++ b/packages/web/src/features/chat/modelCapabilities.ts
@@ -1,5 +1,5 @@
 import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
-import { InputModality } from './types';
+import { DocumentType, InputModality } from './types';
 
 // Fail-closed: when a model does not declare input modalities, assume text-only.
 // NOTE: future work may add live provider capability probing (see
@@ -11,3 +11,14 @@ export const resolveModelInputModalities = (config: LanguageModel): InputModalit
     }
     return ['text'];
 }
+
+// Resolves the document/file container formats (e.g. PDF) a model declares it can
+// ingest natively.
+// Fail-closed: a missing or empty declaration resolves to no document types.
+// Document types are container formats kept separate from raw input modalities,
+// since providers decompose them into text/image internally.
+// The declared array is returned as-is (not copied) when it is non-empty.
+export const resolveModelSupportedDocumentTypes = (config: LanguageModel): DocumentType[] => {
+    const documentTypes = config.supportedDocumentTypes ?? [];
+    return documentTypes.length === 0 ? [] : documentTypes;
+}
diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index 615fe2b1c..e1daf0bdb 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -208,13 +208,15 @@ type _AssertAllProviders = LanguageModelProvider extends typeof languageModelPro
 const _assertAllProviders: _AssertAllProviders = true;
 void _assertAllProviders;
 
-export type InputModality = 'text' | 'image' | 'pdf';
+export type InputModality = 'text' | 'image' | 'audio' | 'video';
+export type DocumentType = 'pdf';
 
 export const languageModelInfoSchema = z.object({
     provider: z.enum(languageModelProviders).describe("The model provider (e.g., 'anthropic', 'openai')"),
     model: z.string().describe("The model ID"),
     displayName: z.string().optional().describe("Optional display name for the model"),
-    inputModalities: z.array(z.enum(['text', 'image', 'pdf'])).default(['text']).describe("The input modalities the model can accept. Defaults to text-only."),
+    inputModalities: z.array(z.enum(['text', 'image', 'audio', 'video'])).default(['text']).describe("The input modalities the model can accept. Defaults to text-only."),
+    supportedDocumentTypes: z.array(z.enum(['pdf'])).default([]).describe("The document/file container formats the model can ingest natively. Defaults to none."),
 });
 
 /**
@@ -225,6 +227,7 @@ export type LanguageModelInfo = {
     model: LanguageModel['model'],
     displayName?: LanguageModel['displayName'],
     inputModalities: InputModality[],
+    supportedDocumentTypes: DocumentType[],
 }
 
 // Additional request body data that we send along to the chat API.
diff --git a/packages/web/src/features/chat/utils.server.ts b/packages/web/src/features/chat/utils.server.ts
index 7ec47b677..0b04226d8 100644
--- a/packages/web/src/features/chat/utils.server.ts
+++ b/packages/web/src/features/chat/utils.server.ts
@@ -7,7 +7,7 @@ import { env, loadConfig } from '@sourcebot/shared';
 import fs from 'fs';
 import path from 'path';
 import { LanguageModelInfo, SBChatMessage } from './types';
-import { resolveModelInputModalities } from './modelCapabilities';
+import { resolveModelInputModalities, resolveModelSupportedDocumentTypes } from './modelCapabilities';
 import { hasEntitlement } from '@/lib/entitlements';
 import { ServiceError } from '@/lib/serviceError';
 import { ErrorCode } from '@/lib/errorCodes';
@@ -133,5 +133,6 @@ export const getConfiguredLanguageModelsInfo = async () => {
         model: model.model,
         displayName: model.displayName,
         inputModalities: resolveModelInputModalities(model),
+        supportedDocumentTypes: resolveModelSupportedDocumentTypes(model),
     }));
 };
diff --git a/schemas/v3/languageModel.json b/schemas/v3/languageModel.json
index 0fb96217a..e49707484 100644
--- a/schemas/v3/languageModel.json
+++ b/schemas/v3/languageModel.json
@@ -57,10 +57,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -111,10 +121,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -189,10 +209,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -239,10 +269,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -303,10 +343,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -369,10 +419,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -451,10 +511,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -501,10 +571,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -577,10 +657,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -643,10 +733,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -694,10 +794,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [
@@ -748,10 +858,20 @@
                         "enum": [
                             "text",
                             "image",
+                            "audio",
+                            "video"
+                        ]
+                    },
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                },
+                "supportedDocumentTypes": {
+                    "type": "array",
+                    "items": {
+                        "enum": [
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of input modalities this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
                 }
             },
             "required": [

From 0baabcba43e86432a1a69846d90a47c83b62499f Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 10:17:14 -0700
Subject: [PATCH 04/10] docs(schemas): clarify what counts as a document type

Tighten the inputModalities / supportedDocumentTypes descriptions to
remove the implication that omitting supportedDocumentTypes blocks all
non-text attachments. Clarify the taxonomy: single-medium files
(images, audio, video) and plain-text files (.txt, .md) are governed by
inputModalities; supportedDocumentTypes only gates rich compound
container formats like PDF.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 docs/snippets/schemas/v3/index.schema.mdx     | 96 +++++++++----------
 .../schemas/v3/languageModel.schema.mdx       | 96 +++++++++----------
 packages/schemas/src/v3/index.schema.ts       | 96 +++++++++----------
 packages/schemas/src/v3/index.type.ts         | 48 +++++-----
 .../schemas/src/v3/languageModel.schema.ts    | 96 +++++++++----------
 packages/schemas/src/v3/languageModel.type.ts | 48 +++++-----
 packages/web/src/features/chat/types.ts       |  4 +-
 schemas/v3/languageModel.json                 | 48 +++++-----
 8 files changed, 266 insertions(+), 266 deletions(-)

diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx
index 5b099d724..356da2009 100644
--- a/docs/snippets/schemas/v3/index.schema.mdx
+++ b/docs/snippets/schemas/v3/index.schema.mdx
@@ -1871,7 +1871,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -1880,7 +1880,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2030,7 +2030,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2039,7 +2039,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2186,7 +2186,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2195,7 +2195,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2314,7 +2314,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2323,7 +2323,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2456,7 +2456,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2465,7 +2465,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2600,7 +2600,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2609,7 +2609,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2760,7 +2760,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2769,7 +2769,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2888,7 +2888,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2897,7 +2897,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3042,7 +3042,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3051,7 +3051,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3227,7 +3227,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3236,7 +3236,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3356,7 +3356,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3365,7 +3365,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3488,7 +3488,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3497,7 +3497,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3689,7 +3689,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3698,7 +3698,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3848,7 +3848,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3857,7 +3857,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4004,7 +4004,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4013,7 +4013,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4132,7 +4132,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4141,7 +4141,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4274,7 +4274,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4283,7 +4283,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4418,7 +4418,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4427,7 +4427,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4578,7 +4578,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4587,7 +4587,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4706,7 +4706,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4715,7 +4715,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4860,7 +4860,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4869,7 +4869,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5045,7 +5045,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5054,7 +5054,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5174,7 +5174,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5183,7 +5183,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5306,7 +5306,7 @@
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5315,7 +5315,7 @@
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
diff --git a/docs/snippets/schemas/v3/languageModel.schema.mdx b/docs/snippets/schemas/v3/languageModel.schema.mdx
index 7b1e774cf..5af4b3d96 100644
--- a/docs/snippets/schemas/v3/languageModel.schema.mdx
+++ b/docs/snippets/schemas/v3/languageModel.schema.mdx
@@ -185,7 +185,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -194,7 +194,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -344,7 +344,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -353,7 +353,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -500,7 +500,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -509,7 +509,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -628,7 +628,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -637,7 +637,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -770,7 +770,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -779,7 +779,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -914,7 +914,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -923,7 +923,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1074,7 +1074,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1083,7 +1083,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1202,7 +1202,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1211,7 +1211,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1356,7 +1356,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1365,7 +1365,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1541,7 +1541,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1550,7 +1550,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1670,7 +1670,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1679,7 +1679,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1802,7 +1802,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1811,7 +1811,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2003,7 +2003,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2012,7 +2012,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2162,7 +2162,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2171,7 +2171,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2318,7 +2318,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2327,7 +2327,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2446,7 +2446,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2455,7 +2455,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2588,7 +2588,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2597,7 +2597,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2732,7 +2732,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2741,7 +2741,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2892,7 +2892,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2901,7 +2901,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3020,7 +3020,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3029,7 +3029,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3174,7 +3174,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3183,7 +3183,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3359,7 +3359,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3368,7 +3368,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3488,7 +3488,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3497,7 +3497,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3620,7 +3620,7 @@
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3629,7 +3629,7 @@
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts
index 7d051544c..123fd4a8b 100644
--- a/packages/schemas/src/v3/index.schema.ts
+++ b/packages/schemas/src/v3/index.schema.ts
@@ -1870,7 +1870,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -1879,7 +1879,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2029,7 +2029,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2038,7 +2038,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2185,7 +2185,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2194,7 +2194,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2313,7 +2313,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2322,7 +2322,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2455,7 +2455,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2464,7 +2464,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2599,7 +2599,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2608,7 +2608,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2759,7 +2759,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2768,7 +2768,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2887,7 +2887,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -2896,7 +2896,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3041,7 +3041,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3050,7 +3050,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3226,7 +3226,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3235,7 +3235,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3355,7 +3355,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3364,7 +3364,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3487,7 +3487,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3496,7 +3496,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3688,7 +3688,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3697,7 +3697,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3847,7 +3847,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -3856,7 +3856,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4003,7 +4003,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4012,7 +4012,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4131,7 +4131,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4140,7 +4140,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4273,7 +4273,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4282,7 +4282,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4417,7 +4417,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4426,7 +4426,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4577,7 +4577,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4586,7 +4586,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4705,7 +4705,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4714,7 +4714,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4859,7 +4859,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -4868,7 +4868,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5044,7 +5044,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5053,7 +5053,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5173,7 +5173,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5182,7 +5182,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5305,7 +5305,7 @@ const schema = {
                     "video"
                   ]
                 },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
               },
               "supportedDocumentTypes": {
                 "type": "array",
@@ -5314,7 +5314,7 @@ const schema = {
                     "pdf"
                   ]
                 },
-                "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts
index 14c8c14e2..d6f555e8d 100644
--- a/packages/schemas/src/v3/index.type.ts
+++ b/packages/schemas/src/v3/index.type.ts
@@ -763,11 +763,11 @@ export interface AmazonBedrockLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -851,11 +851,11 @@ export interface AnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -914,11 +914,11 @@ export interface AzureLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -961,11 +961,11 @@ export interface DeepSeekLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1016,11 +1016,11 @@ export interface GoogleGenerativeAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1071,11 +1071,11 @@ export interface GoogleVertexAnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1134,11 +1134,11 @@ export interface GoogleVertexLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1181,11 +1181,11 @@ export interface MistralLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1236,11 +1236,11 @@ export interface OpenAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1288,11 +1288,11 @@ export interface OpenAICompatibleLanguageModel {
    */
   temperature?: number;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1360,11 +1360,11 @@ export interface OpenRouterLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -1407,11 +1407,11 @@ export interface XaiLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
diff --git a/packages/schemas/src/v3/languageModel.schema.ts b/packages/schemas/src/v3/languageModel.schema.ts
index 9c9ae7b2d..61cc0adf3 100644
--- a/packages/schemas/src/v3/languageModel.schema.ts
+++ b/packages/schemas/src/v3/languageModel.schema.ts
@@ -184,7 +184,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -193,7 +193,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -343,7 +343,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -352,7 +352,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -499,7 +499,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -508,7 +508,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -627,7 +627,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -636,7 +636,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -769,7 +769,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -778,7 +778,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -913,7 +913,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -922,7 +922,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1073,7 +1073,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1082,7 +1082,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1201,7 +1201,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1210,7 +1210,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1355,7 +1355,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1364,7 +1364,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1540,7 +1540,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1549,7 +1549,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1669,7 +1669,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1678,7 +1678,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1801,7 +1801,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -1810,7 +1810,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2002,7 +2002,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2011,7 +2011,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2161,7 +2161,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2170,7 +2170,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2317,7 +2317,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2326,7 +2326,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2445,7 +2445,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2454,7 +2454,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2587,7 +2587,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2596,7 +2596,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2731,7 +2731,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2740,7 +2740,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2891,7 +2891,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -2900,7 +2900,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3019,7 +3019,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3028,7 +3028,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3173,7 +3173,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3182,7 +3182,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3358,7 +3358,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3367,7 +3367,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3487,7 +3487,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3496,7 +3496,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3619,7 +3619,7 @@ const schema = {
               "video"
             ]
           },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
         },
         "supportedDocumentTypes": {
           "type": "array",
@@ -3628,7 +3628,7 @@ const schema = {
               "pdf"
             ]
           },
-          "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/languageModel.type.ts b/packages/schemas/src/v3/languageModel.type.ts
index 3297689b7..90a53b423 100644
--- a/packages/schemas/src/v3/languageModel.type.ts
+++ b/packages/schemas/src/v3/languageModel.type.ts
@@ -89,11 +89,11 @@ export interface AmazonBedrockLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -177,11 +177,11 @@ export interface AnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -240,11 +240,11 @@ export interface AzureLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -287,11 +287,11 @@ export interface DeepSeekLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -342,11 +342,11 @@ export interface GoogleGenerativeAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -397,11 +397,11 @@ export interface GoogleVertexAnthropicLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -460,11 +460,11 @@ export interface GoogleVertexLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -507,11 +507,11 @@ export interface MistralLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -562,11 +562,11 @@ export interface OpenAILanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -614,11 +614,11 @@ export interface OpenAICompatibleLanguageModel {
    */
   temperature?: number;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -686,11 +686,11 @@ export interface OpenRouterLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
@@ -733,11 +733,11 @@ export interface XaiLanguageModel {
   temperature?: number;
   headers?: LanguageModelHeaders;
   /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed).
+   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
    */
   inputModalities?: ("text" | "image" | "audio" | "video")[];
   /**
-   * Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed).
+   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
    */
   supportedDocumentTypes?: "pdf"[];
 }
diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index e1daf0bdb..3547c5d0a 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -215,8 +215,8 @@ export const languageModelInfoSchema = z.object({
     provider: z.enum(languageModelProviders).describe("The model provider (e.g., 'anthropic', 'openai')"),
     model: z.string().describe("The model ID"),
     displayName: z.string().optional().describe("Optional display name for the model"),
-    inputModalities: z.array(z.enum(['text', 'image', 'audio', 'video'])).default(['text']).describe("The input modalities the model can accept. Defaults to text-only."),
-    supportedDocumentTypes: z.array(z.enum(['pdf'])).default([]).describe("The document/file container formats the model can ingest natively. Defaults to none."),
+    inputModalities: z.array(z.enum(['text', 'image', 'audio', 'video'])).default(['text']).describe("The input modalities the model can accept (images, audio, video, text). Single-medium attachments are gated by these. Defaults to text-only."),
+    supportedDocumentTypes: z.array(z.enum(['pdf'])).default([]).describe("Rich compound document formats (e.g. PDF) the model can ingest natively, distinct from single-medium attachments gated by inputModalities. Defaults to none."),
 });
 
 /**
diff --git a/schemas/v3/languageModel.json b/schemas/v3/languageModel.json
index e49707484..a952554b9 100644
--- a/schemas/v3/languageModel.json
+++ b/schemas/v3/languageModel.json
@@ -61,7 +61,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -70,7 +70,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -125,7 +125,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -134,7 +134,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -213,7 +213,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -222,7 +222,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -273,7 +273,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -282,7 +282,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -347,7 +347,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -356,7 +356,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -423,7 +423,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -432,7 +432,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -515,7 +515,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -524,7 +524,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -575,7 +575,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -584,7 +584,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -661,7 +661,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -670,7 +670,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -737,7 +737,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -746,7 +746,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -798,7 +798,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -807,7 +807,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -862,7 +862,7 @@
                             "video"
                         ]
                     },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Used to gate non-text chat attachments. When omitted, the model is treated as text-only (fail-closed)."
+                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
                 },
                 "supportedDocumentTypes": {
                     "type": "array",
@@ -871,7 +871,7 @@
                             "pdf"
                         ]
                     },
-                    "description": "Optional list of document/file container formats (e.g. PDF) the model can ingest natively, distinct from raw input modalities. Used to gate document attachments. When omitted, no document types are supported (fail-closed)."
+                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [

From 5e4045b0ef25d95ac740961d0368b5250e820daf Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 10:25:57 -0700
Subject: [PATCH 05/10] fix(web): widen getLanguageModelKey param to keyable
 subset

LanguageModelInfo now has required inputModalities/supportedDocumentTypes,
so a raw LanguageModel config (where those are optional) is no longer
assignable to it. getLanguageModelKey only reads provider/model/displayName,
so type its parameter as a Pick of just those three fields, letting both
the raw LanguageModel config and LanguageModelInfo be keyed. Fixes the
docker build type check.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 packages/web/src/features/chat/utils.ts | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/web/src/features/chat/utils.ts b/packages/web/src/features/chat/utils.ts
index c7f409ac7..b103ada7c 100644
--- a/packages/web/src/features/chat/utils.ts
+++ b/packages/web/src/features/chat/utils.ts
@@ -422,9 +422,11 @@ export const getAnswerPartFromAssistantMessage = (message: SBChatMessage, isTurn
 }
 
 /**
- * Generates a unique key given a LanguageModelInfo object.
+ * Generates a unique key for a language model. Accepts any object carrying the
+ * identifying fields, so both the full `LanguageModel` config and the
+ * client-safe `LanguageModelInfo` can be keyed with it.
  */
-export const getLanguageModelKey = (model: LanguageModelInfo) => {
+export const getLanguageModelKey = (model: Pick<LanguageModelInfo, 'provider' | 'model' | 'displayName'>) => {
     return `${model.provider}-${model.model}-${model.displayName}`;
 }
 

From 507d7586cb2f10f8ae166629cb36af19872d4e6d Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Fri, 26 Jun 2026 10:45:05 -0700
Subject: [PATCH 06/10] chore(schemas,web): keep schema dist fresh and resolve
 types from source

Two dev-experience fixes for the stale-build-output footgun:

- schemas watch now runs `yarn build` (generate + tsc) instead of
  generate-only, so editing a schema JSON during `yarn dev` refreshes
  dist (both the .d.ts types and the runtime index.schema.js used by
  ajv), not just the generated source.
- web tsconfig maps @sourcebot/schemas/v3/* and @sourcebot/schemas/v2/*
  to the package source, so type-checking and the IDE read committed
  source directly instead of stale built .d.ts. Web only imports .type
  files (erased at compile), so there is no bundling/runtime impact.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 packages/schemas/package.json | 2 +-
 packages/web/tsconfig.json    | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/packages/schemas/package.json b/packages/schemas/package.json
index 13fe2cb7a..3719a6da5 100644
--- a/packages/schemas/package.json
+++ b/packages/schemas/package.json
@@ -5,7 +5,7 @@
     "scripts": {
         "build": "yarn generate && tsc",
         "generate": "tsx tools/generate.ts",
-        "watch": "nodemon --watch ../../schemas -e json -x 'yarn generate'",
+        "watch": "nodemon --watch ../../schemas -e json -x 'yarn build'",
         "postinstall": "yarn build"
     },
     "devDependencies": {
diff --git a/packages/web/tsconfig.json b/packages/web/tsconfig.json
index f18162100..3f0e7534b 100644
--- a/packages/web/tsconfig.json
+++ b/packages/web/tsconfig.json
@@ -27,6 +27,12 @@
       ],
       "@/public/*": [
         "./public/*"
+      ],
+      "@sourcebot/schemas/v3/*": [
+        "../schemas/src/v3/*"
+      ],
+      "@sourcebot/schemas/v2/*": [
+        "../schemas/src/v2/*"
       ]
     },
     "target": "ES2017"

From de291cc8ab301b28067812186f534bcba5b6f69e Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 11:30:15 -0700
Subject: [PATCH 07/10] refactor(web): resolve model capabilities from
 models.dev, not config.json

Re-source language model input-modality / document capabilities from the
models.dev catalog instead of hand-declared config.json fields, aligning
with the move to de-emphasize on-disk config in favor of automatic
resolution (the same catalog already backs context-window resolution).

- Revert the inputModalities/supportedDocumentTypes additions to
  schemas/v3/languageModel.json and all regenerated artifacts; capabilities
  are no longer declared in config.json.
- Extract the shared models.dev catalog plumbing (fetch/TTL/negative-cache/
  stale-while-revalidate/provider-id overrides) into modelsDevCatalog.server.ts,
  now consumed by both context-window and capability resolution.
- Add models.dev-backed resolveModelCapabilities (modelCapabilities.server.ts),
  partitioning the catalog's modalities.input list into Sourcebot's
  inputModalities (channels) and supportedDocumentTypes (containers); falls back
  to text-only for uncatalogued / self-hosted models.

The client-safe LanguageModelInfo contract is unchanged; only the resolution
backend moved.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 CHANGELOG.md                                  |   2 +-
 docs/snippets/schemas/v3/index.schema.mdx     | 504 ------------------
 .../schemas/v3/languageModel.schema.mdx       | 504 ------------------
 packages/schemas/src/v3/index.schema.ts       | 504 ------------------
 packages/schemas/src/v3/index.type.ts         |  96 ----
 .../schemas/src/v3/languageModel.schema.ts    | 504 ------------------
 packages/schemas/src/v3/languageModel.type.ts |  96 ----
 .../web/src/ee/features/mcp/askCodebase.ts    |   7 +-
 .../chat/modelCapabilities.server.test.ts     | 126 +++++
 .../features/chat/modelCapabilities.server.ts |  64 +++
 .../src/features/chat/modelCapabilities.ts    |  24 -
 .../chat/modelContextWindow.server.ts         |  99 +---
 .../features/chat/modelsDevCatalog.server.ts  | 111 ++++
 .../web/src/features/chat/utils.server.ts     |  17 +-
 schemas/v3/languageModel.json                 | 254 +--------
 15 files changed, 322 insertions(+), 2590 deletions(-)
 create mode 100644 packages/web/src/features/chat/modelCapabilities.server.test.ts
 create mode 100644 packages/web/src/features/chat/modelCapabilities.server.ts
 delete mode 100644 packages/web/src/features/chat/modelCapabilities.ts
 create mode 100644 packages/web/src/features/chat/modelsDevCatalog.server.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27408c782..68e63d675 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added per-step token cost tracking and estimated tool call token usage to Ask Sourcebot chat history. [#1353](https://github.com/sourcebot-dev/sourcebot/pull/1353)
 - [EE] Added mermaid diagram rendering to Ask Sourcebot answers, with pan/zoom, copy/export, in-thread deep links, and an interleaved right-panel view. [#1369](https://github.com/sourcebot-dev/sourcebot/pull/1369)
 - [EE] Added a context-window usage gauge to the Ask Sourcebot chat details, showing how much of the selected model's context window each turn occupies. Window sizes are resolved from the models.dev catalog. [#1370](https://github.com/sourcebot-dev/sourcebot/pull/1370)
-- Added optional `inputModalities` and `supportedDocumentTypes` configuration for language models, exposing model input-modality and document capabilities (defaults to text-only, no documents). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
+- Added language model input-modality and document capability resolution, automatically resolved from the models.dev catalog (falls back to text-only for uncatalogued/self-hosted models). [#1372](https://github.com/sourcebot-dev/sourcebot/pull/1372)
 
 ### Fixed
 - Send anonymous server-side PostHog events as personless so unauthenticated requests don't inflate person counts. [#1367](https://github.com/sourcebot-dev/sourcebot/pull/1367)
diff --git a/docs/snippets/schemas/v3/index.schema.mdx b/docs/snippets/schemas/v3/index.schema.mdx
index 356da2009..864359251 100644
--- a/docs/snippets/schemas/v3/index.schema.mdx
+++ b/docs/snippets/schemas/v3/index.schema.mdx
@@ -1860,27 +1860,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2019,27 +1998,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2175,27 +2133,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2303,27 +2240,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2445,27 +2361,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2589,27 +2484,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2749,27 +2623,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2877,27 +2730,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3031,27 +2863,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3216,27 +3027,6 @@
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3345,27 +3135,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3477,27 +3246,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3678,27 +3426,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3837,27 +3564,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3993,27 +3699,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4121,27 +3806,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4263,27 +3927,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4407,27 +4050,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4567,27 +4189,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4695,27 +4296,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4849,27 +4429,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5034,27 +4593,6 @@
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5163,27 +4701,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5295,27 +4812,6 @@
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
diff --git a/docs/snippets/schemas/v3/languageModel.schema.mdx b/docs/snippets/schemas/v3/languageModel.schema.mdx
index 5af4b3d96..90aee08af 100644
--- a/docs/snippets/schemas/v3/languageModel.schema.mdx
+++ b/docs/snippets/schemas/v3/languageModel.schema.mdx
@@ -174,27 +174,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -333,27 +312,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -489,27 +447,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -617,27 +554,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -759,27 +675,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -903,27 +798,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1063,27 +937,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1191,27 +1044,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1345,27 +1177,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1530,27 +1341,6 @@
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1659,27 +1449,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1791,27 +1560,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1992,27 +1740,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2151,27 +1878,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2307,27 +2013,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2435,27 +2120,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2577,27 +2241,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2721,27 +2364,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2881,27 +2503,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3009,27 +2610,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3163,27 +2743,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3348,27 +2907,6 @@
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3477,27 +3015,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3609,27 +3126,6 @@
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/index.schema.ts b/packages/schemas/src/v3/index.schema.ts
index 123fd4a8b..8c1d64b52 100644
--- a/packages/schemas/src/v3/index.schema.ts
+++ b/packages/schemas/src/v3/index.schema.ts
@@ -1859,27 +1859,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2018,27 +1997,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2174,27 +2132,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2302,27 +2239,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2444,27 +2360,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2588,27 +2483,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2748,27 +2622,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -2876,27 +2729,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3030,27 +2862,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3215,27 +3026,6 @@ const schema = {
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3344,27 +3134,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3476,27 +3245,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3677,27 +3425,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3836,27 +3563,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -3992,27 +3698,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4120,27 +3805,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4262,27 +3926,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4406,27 +4049,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4566,27 +4188,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4694,27 +4295,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -4848,27 +4428,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5033,27 +4592,6 @@ const schema = {
               "temperature": {
                 "type": "number",
                 "description": "Optional temperature setting to use with the model."
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5162,27 +4700,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
@@ -5294,27 +4811,6 @@ const schema = {
                   }
                 },
                 "additionalProperties": false
-              },
-              "inputModalities": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "text",
-                    "image",
-                    "audio",
-                    "video"
-                  ]
-                },
-                "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-              },
-              "supportedDocumentTypes": {
-                "type": "array",
-                "items": {
-                  "enum": [
-                    "pdf"
-                  ]
-                },
-                "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
               }
             },
             "required": [
diff --git a/packages/schemas/src/v3/index.type.ts b/packages/schemas/src/v3/index.type.ts
index d6f555e8d..7fa7f5a17 100644
--- a/packages/schemas/src/v3/index.type.ts
+++ b/packages/schemas/src/v3/index.type.ts
@@ -762,14 +762,6 @@ export interface AmazonBedrockLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional headers to use with the model.
@@ -850,14 +842,6 @@ export interface AnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface AzureLanguageModel {
   /**
@@ -913,14 +897,6 @@ export interface AzureLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -960,14 +936,6 @@ export interface DeepSeekLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -1015,14 +983,6 @@ export interface GoogleGenerativeAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -1070,14 +1030,6 @@ export interface GoogleVertexAnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -1133,14 +1085,6 @@ export interface GoogleVertexLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface MistralLanguageModel {
   /**
@@ -1180,14 +1124,6 @@ export interface MistralLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -1235,14 +1171,6 @@ export interface OpenAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -1287,14 +1215,6 @@ export interface OpenAICompatibleLanguageModel {
    * Optional temperature setting to use with the model.
    */
   temperature?: number;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -1359,14 +1279,6 @@ export interface OpenRouterLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface XaiLanguageModel {
   /**
@@ -1406,14 +1318,6 @@ export interface XaiLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GitHubAppConfig {
   /**
diff --git a/packages/schemas/src/v3/languageModel.schema.ts b/packages/schemas/src/v3/languageModel.schema.ts
index 61cc0adf3..ab418ce79 100644
--- a/packages/schemas/src/v3/languageModel.schema.ts
+++ b/packages/schemas/src/v3/languageModel.schema.ts
@@ -173,27 +173,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -332,27 +311,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -488,27 +446,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -616,27 +553,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -758,27 +674,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -902,27 +797,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1062,27 +936,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1190,27 +1043,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1344,27 +1176,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1529,27 +1340,6 @@ const schema = {
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1658,27 +1448,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1790,27 +1559,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -1991,27 +1739,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2150,27 +1877,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2306,27 +2012,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2434,27 +2119,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2576,27 +2240,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2720,27 +2363,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -2880,27 +2502,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3008,27 +2609,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3162,27 +2742,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3347,27 +2906,6 @@ const schema = {
         "temperature": {
           "type": "number",
           "description": "Optional temperature setting to use with the model."
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3476,27 +3014,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
@@ -3608,27 +3125,6 @@ const schema = {
             }
           },
           "additionalProperties": false
-        },
-        "inputModalities": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "text",
-              "image",
-              "audio",
-              "video"
-            ]
-          },
-          "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-        },
-        "supportedDocumentTypes": {
-          "type": "array",
-          "items": {
-            "enum": [
-              "pdf"
-            ]
-          },
-          "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
         }
       },
       "required": [
diff --git a/packages/schemas/src/v3/languageModel.type.ts b/packages/schemas/src/v3/languageModel.type.ts
index 90a53b423..5c3b25668 100644
--- a/packages/schemas/src/v3/languageModel.type.ts
+++ b/packages/schemas/src/v3/languageModel.type.ts
@@ -88,14 +88,6 @@ export interface AmazonBedrockLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional headers to use with the model.
@@ -176,14 +168,6 @@ export interface AnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface AzureLanguageModel {
   /**
@@ -239,14 +223,6 @@ export interface AzureLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface DeepSeekLanguageModel {
   /**
@@ -286,14 +262,6 @@ export interface DeepSeekLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleGenerativeAILanguageModel {
   /**
@@ -341,14 +309,6 @@ export interface GoogleGenerativeAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexAnthropicLanguageModel {
   /**
@@ -396,14 +356,6 @@ export interface GoogleVertexAnthropicLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface GoogleVertexLanguageModel {
   /**
@@ -459,14 +411,6 @@ export interface GoogleVertexLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface MistralLanguageModel {
   /**
@@ -506,14 +450,6 @@ export interface MistralLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAILanguageModel {
   /**
@@ -561,14 +497,6 @@ export interface OpenAILanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface OpenAICompatibleLanguageModel {
   /**
@@ -613,14 +541,6 @@ export interface OpenAICompatibleLanguageModel {
    * Optional temperature setting to use with the model.
    */
   temperature?: number;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 /**
  * Optional query parameters to include in the request url.
@@ -685,14 +605,6 @@ export interface OpenRouterLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
 export interface XaiLanguageModel {
   /**
@@ -732,12 +644,4 @@ export interface XaiLanguageModel {
    */
   temperature?: number;
   headers?: LanguageModelHeaders;
-  /**
-   * Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed).
-   */
-  inputModalities?: ("text" | "image" | "audio" | "video")[];
-  /**
-   * Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`.
-   */
-  supportedDocumentTypes?: "pdf"[];
 }
diff --git a/packages/web/src/ee/features/mcp/askCodebase.ts b/packages/web/src/ee/features/mcp/askCodebase.ts
index 59f8a35ec..35337d29f 100644
--- a/packages/web/src/ee/features/mcp/askCodebase.ts
+++ b/packages/web/src/ee/features/mcp/askCodebase.ts
@@ -5,7 +5,7 @@ import { getAISDKLanguageModelAndOptions } from "@/features/chat/llm.server";
 import { resolveContextWindow } from "@/features/chat/modelContextWindow.server";
 import { LanguageModelInfo, SBChatMessage, SearchScope } from "@/features/chat/types";
 import { convertLLMOutputToPortableMarkdown, getAnswerPartFromAssistantMessage, getLanguageModelKey } from "@/features/chat/utils";
-import { resolveModelInputModalities, resolveModelSupportedDocumentTypes } from "@/features/chat/modelCapabilities";
+import { resolveModelCapabilities } from "@/features/chat/modelCapabilities.server";
 import { ErrorCode } from "@/lib/errorCodes";
 import { ServiceError, ServiceErrorException } from "@/lib/serviceError";
 import { withOptionalAuth } from "@/middleware/withAuth";
@@ -87,6 +87,7 @@ export const askCodebase = (params: AskCodebaseParams): Promise<AskCodebaseResul
             const { model, providerOptions, temperature } = await getAISDKLanguageModelAndOptions(languageModelConfig);
             const modelName = languageModelConfig.displayName ?? languageModelConfig.model;
             const contextWindow = await resolveContextWindow(languageModelConfig);
+            const { inputModalities, supportedDocumentTypes } = await resolveModelCapabilities(languageModelConfig);
 
             // No-op for non-Anthropic providers / when caching is disabled.
             const promptCacheStrategy = getPromptCacheStrategy(
@@ -247,8 +248,8 @@ export const askCodebase = (params: AskCodebaseParams): Promise<AskCodebaseResul
                     provider: languageModelConfig.provider,
                     model: languageModelConfig.model,
                     displayName: languageModelConfig.displayName,
-                    inputModalities: resolveModelInputModalities(languageModelConfig),
-                    supportedDocumentTypes: resolveModelSupportedDocumentTypes(languageModelConfig),
+                    inputModalities,
+                    supportedDocumentTypes,
                 },
             } satisfies AskCodebaseResult;
         })
diff --git a/packages/web/src/features/chat/modelCapabilities.server.test.ts b/packages/web/src/features/chat/modelCapabilities.server.test.ts
new file mode 100644
index 000000000..0a2e9a2ec
--- /dev/null
+++ b/packages/web/src/features/chat/modelCapabilities.server.test.ts
@@ -0,0 +1,126 @@
+import { afterEach, describe, expect, test, vi } from 'vitest';
+import type { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
+
+vi.mock('server-only', () => ({ default: vi.fn() }));
+
+vi.mock('@sourcebot/shared', () => ({
+    createLogger: () => ({
+        info: vi.fn(),
+        warn: vi.fn(),
+        error: vi.fn(),
+        debug: vi.fn(),
+    }),
+}));
+
+import { lookupModelCapabilities, resolveModelCapabilities } from './modelCapabilities.server';
+import type { ModelsDevCatalog } from './modelsDevCatalog.server';
+
+const catalog: ModelsDevCatalog = {
+    anthropic: {
+        id: 'anthropic',
+        models: {
+            // Text + image + a document (pdf) container format.
+            'claude-sonnet-4-5': {
+                id: 'claude-sonnet-4-5',
+                modalities: { input: ['text', 'image', 'pdf'], output: ['text'] },
+            },
+        },
+    },
+    // models.dev keys Gemini under 'google', whereas Sourcebot's provider id is
+    // 'google-generative-ai' — exercises the provider id override.
+    google: {
+        id: 'google',
+        models: {
+            'gemini-2.5-pro': {
+                id: 'gemini-2.5-pro',
+                modalities: { input: ['text', 'image', 'audio', 'video', 'pdf'], output: ['text'] },
+            },
+        },
+    },
+    openai: {
+        id: 'openai',
+        models: {
+            // Catalogued model that omits `text` from its input list.
+            'image-only': { id: 'image-only', modalities: { input: ['image'], output: ['text'] } },
+            // Catalogued model with no `modalities` object at all.
+            'no-modalities-model': { id: 'no-modalities-model' },
+        },
+    },
+};
+
+const model = (provider: string, modelId: string) => // Test helper: builds the minimal provider/model pair the resolvers read.
+    ({ provider, model: modelId }) as Pick<LanguageModel, 'provider' | 'model'>; // cast lets tests pass provider ids outside the schema union
+
+describe('lookupModelCapabilities', () => {
+    test('splits modalities and document types for a direct provider/model hit', () => {
+        expect(lookupModelCapabilities(catalog, model('anthropic', 'claude-sonnet-4-5'))).toEqual({
+            inputModalities: ['text', 'image'],
+            supportedDocumentTypes: ['pdf'],
+        });
+    });
+
+    test('maps google-generative-ai to the catalog\'s google key', () => {
+        expect(lookupModelCapabilities(catalog, model('google-generative-ai', 'gemini-2.5-pro'))).toEqual({
+            inputModalities: ['text', 'image', 'audio', 'video'],
+            supportedDocumentTypes: ['pdf'],
+        });
+    });
+
+    test('always includes text even when the catalog omits it', () => {
+        expect(lookupModelCapabilities(catalog, model('openai', 'image-only'))).toEqual({
+            inputModalities: ['text', 'image'],
+            supportedDocumentTypes: [],
+        });
+    });
+
+    test('falls back to text-only for a catalogued model with no modalities', () => {
+        expect(lookupModelCapabilities(catalog, model('openai', 'no-modalities-model'))).toEqual({
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
+        });
+    });
+
+    test('falls back to text-only for an uncatalogued model (e.g. openai-compatible / self-hosted)', () => {
+        expect(lookupModelCapabilities(catalog, model('openai-compatible', 'my-local-model'))).toEqual({
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
+        });
+        expect(lookupModelCapabilities(catalog, model('anthropic', 'claude-unknown'))).toEqual({
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
+        });
+    });
+
+    test('falls back to text-only when the catalog is null (fetch failed / unreachable)', () => {
+        expect(lookupModelCapabilities(null, model('anthropic', 'claude-sonnet-4-5'))).toEqual({
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
+        });
+    });
+});
+
+describe('resolveModelCapabilities', () => {
+    afterEach(() => {
+        vi.unstubAllGlobals(); // undo the fetch stub installed by the test below
+    });
+
+    test('fetches the catalog once and resolves capabilities (incl. provider mapping)', async () => {
+        const fetchMock = vi.fn(async () => ({
+            ok: true,
+            json: async () => catalog,
+        }) as unknown as Response);
+        vi.stubGlobal('fetch', fetchMock); // stands in for the global fetch for the duration of this test
+
+        expect(await resolveModelCapabilities(model('anthropic', 'claude-sonnet-4-5'))).toEqual({
+            inputModalities: ['text', 'image'],
+            supportedDocumentTypes: ['pdf'],
+        });
+        // The second lookup must be served from the cached catalog, not a new fetch.
+        expect(await resolveModelCapabilities(model('google-generative-ai', 'gemini-2.5-pro'))).toEqual({
+            inputModalities: ['text', 'image', 'audio', 'video'],
+            supportedDocumentTypes: ['pdf'],
+        });
+
+        expect(fetchMock).toHaveBeenCalledTimes(1); // NOTE(review): presumably relies on the loader's module-level cache being empty when this test starts — confirm before adding more fetching tests
+    });
+});
diff --git a/packages/web/src/features/chat/modelCapabilities.server.ts b/packages/web/src/features/chat/modelCapabilities.server.ts
new file mode 100644
index 000000000..87d2cb131
--- /dev/null
+++ b/packages/web/src/features/chat/modelCapabilities.server.ts
@@ -0,0 +1,64 @@
+import 'server-only';
+
+import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
+import { DocumentType, InputModality } from './types';
+import { loadCatalog, resolveProviderId, type ModelsDevCatalog } from './modelsDevCatalog.server';
+
+// models.dev folds every accepted input — perceptual channels (text, image,
+// audio, video) AND container formats (pdf) — into a single `modalities.input`
+// list. Sourcebot keeps those two concepts apart: `inputModalities` are the
+// raw channels a model encodes, while `supportedDocumentTypes` are rich
+// compound formats providers decompose server-side. We partition the catalog's
+// input list into those two buckets here.
+const INPUT_MODALITY_VALUES = ['text', 'image', 'audio', 'video'] as const satisfies readonly InputModality[];
+const DOCUMENT_TYPE_VALUES = ['pdf'] as const satisfies readonly DocumentType[];
+
+const isInputModality = (value: string): value is InputModality => // Type guard: true when value is one of INPUT_MODALITY_VALUES.
+    (INPUT_MODALITY_VALUES as readonly string[]).includes(value); // widened to string[] so includes() accepts an arbitrary string
+
+const isDocumentType = (value: string): value is DocumentType => // Type guard: true when value is one of DOCUMENT_TYPE_VALUES.
+    (DOCUMENT_TYPE_VALUES as readonly string[]).includes(value); // widened to string[] so includes() accepts an arbitrary string
+
+export type ModelCapabilities = { // Result shape shared by lookupModelCapabilities and resolveModelCapabilities.
+    inputModalities: InputModality[]; // raw channels (text / image / audio / video)
+    supportedDocumentTypes: DocumentType[]; // container formats (currently only pdf)
+};
+
+/**
+ * Pure lookup of a model's input capabilities in a models.dev catalog.
+ * Separated from the network fetch so it can be unit-tested directly.
+ *
+ * Resolution is automatic from the catalog — capabilities are NOT hand-declared
+ * in config.json. When a model isn't catalogued (e.g. a self-hosted /
+ * openai-compatible endpoint we can't introspect) or its entry is malformed,
+ * we fall back to text-only with no document support: the model stays usable
+ * for normal chat, and richer attachments stay gated off until confirmed.
+ */
+export const lookupModelCapabilities = (
+    catalog: ModelsDevCatalog | null,
+    config: Pick<LanguageModel, 'provider' | 'model'>,
+): ModelCapabilities => {
+    const providerId = resolveProviderId(config.provider);
+    const inputs = catalog?.[providerId]?.models?.[config.model]?.modalities?.input;
+    // The catalog is unvalidated remote JSON, so fail closed on any non-array value.
+    if (!Array.isArray(inputs) || inputs.length === 0) {
+        return { inputModalities: ['text'], supportedDocumentTypes: [] };
+    }
+
+    const inputModalities = inputs.filter(isInputModality);
+    const supportedDocumentTypes = inputs.filter(isDocumentType);
+
+    // Every model accepts text, even if the catalog omits it from the list.
+    if (!inputModalities.includes('text')) {
+        inputModalities.unshift('text');
+    }
+
+    return { inputModalities, supportedDocumentTypes };
+};
+
+export const resolveModelCapabilities = async ( // Async entry point: loads the shared catalog, then delegates to lookupModelCapabilities.
+    config: Pick<LanguageModel, 'provider' | 'model'>,
+): Promise<ModelCapabilities> => {
+    const catalog = await loadCatalog(); // may be null, in which case the lookup returns the text-only fallback
+    return lookupModelCapabilities(catalog, config);
+};
diff --git a/packages/web/src/features/chat/modelCapabilities.ts b/packages/web/src/features/chat/modelCapabilities.ts
deleted file mode 100644
index 8b976af59..000000000
--- a/packages/web/src/features/chat/modelCapabilities.ts
+++ /dev/null
@@ -1,24 +0,0 @@
-import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
-import { DocumentType, InputModality } from './types';
-
-// Fail-closed: when a model does not declare input modalities, assume text-only.
-// NOTE: future work may add live provider capability probing (see
-// tryResolveAnthropicThinkingConfig in llm.server.ts for the precedent).
-export const resolveModelInputModalities = (config: LanguageModel): InputModality[] => {
-    const declared = config.inputModalities;
-    if (declared && declared.length > 0) {
-        return declared;
-    }
-    return ['text'];
-}
-
-// Fail-closed: when a model does not declare supported document types, assume none.
-// Document types (e.g. PDF) are container formats distinct from raw input
-// modalities, since providers decompose them into text/image internally.
-export const resolveModelSupportedDocumentTypes = (config: LanguageModel): DocumentType[] => {
-    const declared = config.supportedDocumentTypes;
-    if (declared && declared.length > 0) {
-        return declared;
-    }
-    return [];
-}
diff --git a/packages/web/src/features/chat/modelContextWindow.server.ts b/packages/web/src/features/chat/modelContextWindow.server.ts
index 0e70dc04f..f87bbcf3b 100644
--- a/packages/web/src/features/chat/modelContextWindow.server.ts
+++ b/packages/web/src/features/chat/modelContextWindow.server.ts
@@ -1,100 +1,11 @@
 import 'server-only';
 
 import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
-import { createLogger } from '@sourcebot/shared';
+import { loadCatalog, resolveProviderId, type ModelsDevCatalog } from './modelsDevCatalog.server';
 
-const logger = createLogger('model-context-window');
-
-// The same public, unauthenticated catalog the setup wizard already consumes
-// (see packages/setupWizard/src/models.ts). Each model entry exposes a
-// `limit.context` field holding the total context window in tokens.
-const MODELS_DEV_API_URL = 'https://models.dev/api.json';
-const FETCH_TIMEOUT_MS = 8000;
-// Re-fetch the (~2.4 MB) catalog at most once per this interval per server
-// process. New models trickle in daily; a stale window for a few hours is fine.
-const CATALOG_TTL_MS = 6 * 60 * 60 * 1000;
-// After a failed fetch, don't reattempt for this long. Without it, an outage in
-// models.dev would make every chat send pay the fetch timeout on the request path.
-const NEGATIVE_CACHE_MS = 60 * 1000;
-
-// Sourcebot provider id -> models.dev top-level catalog key. Only providers
-// whose Sourcebot id differs from the models.dev id need an entry; everything
-// else (anthropic, openai, azure, amazon-bedrock, mistral, deepseek, xai,
-// openrouter, google-vertex, google-vertex-anthropic) matches 1:1.
-const PROVIDER_ID_OVERRIDES: Record<string, string> = {
-    'google-generative-ai': 'google',
-};
-
-type ModelsDevModel = {
-    id: string;
-    limit?: {
-        context?: number;
-        output?: number;
-    };
-};
-
-type ModelsDevProvider = {
-    id: string;
-    models?: Record<string, ModelsDevModel>;
-};
-
-export type ModelsDevCatalog = Record<string, ModelsDevProvider>;
-
-// Last successfully-fetched catalog. Served while fresh, and kept as a fallback
-// when a later refresh fails. `catalogFetchedAt` is when it was fetched (TTL),
-// `lastFailedAt` the most recent fetch failure (negative-cache backoff), and
-// `inFlightFetch` dedupes concurrent fetches.
-let cachedCatalog: ModelsDevCatalog | null = null;
-let catalogFetchedAt = 0;
-let lastFailedAt = 0;
-let inFlightFetch: Promise<ModelsDevCatalog | null> | null = null;
-
-const fetchCatalog = async (): Promise<ModelsDevCatalog | null> => {
-    try {
-        const response = await fetch(MODELS_DEV_API_URL, {
-            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
-        });
-        if (!response.ok) {
-            logger.warn(`Failed to fetch models.dev catalog: ${response.status} ${response.statusText}`);
-            return null;
-        }
-        return await response.json() as ModelsDevCatalog;
-    } catch (error) {
-        logger.warn(`Failed to fetch models.dev catalog: ${error}`);
-        return null;
-    }
-};
-
-const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
-    const now = Date.now();
-    const isFresh = cachedCatalog !== null && now - catalogFetchedAt <= CATALOG_TTL_MS;
-    const isBackingOff = now - lastFailedAt < NEGATIVE_CACHE_MS;
-
-    // Kick off a (deduped) refresh when the cache is stale/empty and we're not
-    // within the post-failure backoff window. On success it replaces the cache;
-    // on failure it only records the failure time, leaving the last-known-good
-    // catalog intact.
-    if (!isFresh && !isBackingOff && !inFlightFetch) {
-        inFlightFetch = fetchCatalog().then((catalog) => {
-            if (catalog) {
-                cachedCatalog = catalog;
-                catalogFetchedAt = Date.now();
-            } else {
-                lastFailedAt = Date.now();
-            }
-            inFlightFetch = null;
-            return catalog;
-        });
-    }
-
-    // Once a catalog has loaded once, never block the request path on the
-    // network: serve the last-known-good value (even if stale) and let any
-    // refresh settle in the background. Only the very first load awaits.
-    if (cachedCatalog !== null) {
-        return cachedCatalog;
-    }
-    return inFlightFetch ?? null;
-};
+// Re-exported so existing consumers/tests can keep importing the catalog type
+// from here.
+export type { ModelsDevCatalog } from './modelsDevCatalog.server';
 
 /**
  * Pure lookup of a model's context window in a models.dev catalog. Separated
@@ -110,7 +21,7 @@ export const lookupContextWindow = (
     if (!catalog) {
         return undefined;
     }
-    const providerId = PROVIDER_ID_OVERRIDES[config.provider] ?? config.provider;
+    const providerId = resolveProviderId(config.provider);
     const context = catalog[providerId]?.models?.[config.model]?.limit?.context;
     // `limit` is schema-optional, and models.dev reports a 0 context window for
     // non-text models (image/audio/etc.). Treat both as "unknown" so the UI
diff --git a/packages/web/src/features/chat/modelsDevCatalog.server.ts b/packages/web/src/features/chat/modelsDevCatalog.server.ts
new file mode 100644
index 000000000..8f6b35043
--- /dev/null
+++ b/packages/web/src/features/chat/modelsDevCatalog.server.ts
@@ -0,0 +1,111 @@
+import 'server-only';
+
+import { createLogger } from '@sourcebot/shared';
+
+const logger = createLogger('models-dev-catalog');
+
+// The same public, unauthenticated catalog the setup wizard already consumes
+// (see packages/setupWizard/src/models.ts). Each model entry exposes a
+// `limit.context` field (total context window in tokens) and a `modalities`
+// field describing the inputs/outputs the model supports natively.
+const MODELS_DEV_API_URL = 'https://models.dev/api.json';
+const FETCH_TIMEOUT_MS = 8000;
+// Re-fetch the (~2.4 MB) catalog at most once per this interval per server
+// process. New models trickle in daily; a stale window for a few hours is fine.
+const CATALOG_TTL_MS = 6 * 60 * 60 * 1000;
+// After a failed fetch, don't reattempt for this long. Without it, an outage in
+// models.dev would make every chat send pay the fetch timeout on the request path.
+const NEGATIVE_CACHE_MS = 60 * 1000;
+
+// Sourcebot provider id -> models.dev top-level catalog key. Only providers
+// whose Sourcebot id differs from the models.dev id need an entry; everything
+// else (anthropic, openai, azure, amazon-bedrock, mistral, deepseek, xai,
+// openrouter, google-vertex, google-vertex-anthropic) matches 1:1.
+const PROVIDER_ID_OVERRIDES: Record<string, string> = {
+    'google-generative-ai': 'google',
+};
+
+export const resolveProviderId = (provider: string): string => // Maps a Sourcebot provider id to its models.dev catalog key.
+    PROVIDER_ID_OVERRIDES[provider] ?? provider; // ids without an override are returned unchanged
+
+type ModelsDevModel = {
+    id: string;
+    limit?: {
+        context?: number;
+        output?: number;
+    };
+    modalities?: {
+        // e.g. ["text", "image", "pdf", "audio", "video"]
+        input?: string[];
+        output?: string[];
+    };
+};
+
+type ModelsDevProvider = {
+    id: string;
+    models?: Record<string, ModelsDevModel>;
+};
+
+export type ModelsDevCatalog = Record<string, ModelsDevProvider>;
+
+// Last successfully-fetched catalog. Served while fresh, and kept as a fallback
+// when a later refresh fails. `catalogFetchedAt` is when it was fetched (TTL),
+// `lastFailedAt` the most recent fetch failure (negative-cache backoff), and
+// `inFlightFetch` dedupes concurrent fetches.
+let cachedCatalog: ModelsDevCatalog | null = null;
+let catalogFetchedAt = 0;
+let lastFailedAt = 0;
+let inFlightFetch: Promise<ModelsDevCatalog | null> | null = null;
+
+const fetchCatalog = async (): Promise<ModelsDevCatalog | null> => { // One fetch attempt; failures are logged and mapped to null rather than thrown.
+    try {
+        const response = await fetch(MODELS_DEV_API_URL, {
+            signal: AbortSignal.timeout(FETCH_TIMEOUT_MS), // give up on the request after FETCH_TIMEOUT_MS
+        });
+        if (!response.ok) {
+            logger.warn(`Failed to fetch models.dev catalog: ${response.status} ${response.statusText}`);
+            return null;
+        }
+        return await response.json() as ModelsDevCatalog; // NOTE: type cast only — the payload shape is not validated here
+    } catch (error) {
+        logger.warn(`Failed to fetch models.dev catalog: ${error}`); // reached for anything thrown by fetch() or response.json()
+        return null;
+    }
+};
+
+/**
+ * Returns the cached models.dev catalog, refreshing it in the background when
+ * stale. Callers await the network only while no catalog has loaded yet; after
+ * that the last-known-good catalog is returned immediately (even if stale).
+ * Resolves to null when nothing is cached and no fetch succeeds or is allowed.
+ */
+export const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
+    const now = Date.now();
+    const isFresh = cachedCatalog !== null && now - catalogFetchedAt <= CATALOG_TTL_MS;
+    const isBackingOff = now - lastFailedAt < NEGATIVE_CACHE_MS; // true only within NEGATIVE_CACHE_MS of the last failed fetch
+
+    // Kick off a (deduped) refresh when the cache is stale/empty and we're not
+    // within the post-failure backoff window. On success it replaces the cache;
+    // on failure it only records the failure time, leaving the last-known-good
+    // catalog intact.
+    if (!isFresh && !isBackingOff && !inFlightFetch) {
+        inFlightFetch = fetchCatalog().then((catalog) => {
+            if (catalog) {
+                cachedCatalog = catalog;
+                catalogFetchedAt = Date.now();
+            } else {
+                lastFailedAt = Date.now();
+            }
+            inFlightFetch = null; // cleared on both outcomes so a later call can start a new refresh
+            return catalog;
+        });
+    }
+
+    // After the first successful load, never block the request path on the
+    // network: serve the last-known-good value (even if stale) and let any
+    // refresh settle in the background. Only calls made before then await.
+    if (cachedCatalog !== null) {
+        return cachedCatalog;
+    }
+    return inFlightFetch ?? null; // null when nothing is cached and no fetch is in flight (e.g. during backoff)
+};
diff --git a/packages/web/src/features/chat/utils.server.ts b/packages/web/src/features/chat/utils.server.ts
index 0b04226d8..a458c41d8 100644
--- a/packages/web/src/features/chat/utils.server.ts
+++ b/packages/web/src/features/chat/utils.server.ts
@@ -7,7 +7,7 @@ import { env, loadConfig } from '@sourcebot/shared';
 import fs from 'fs';
 import path from 'path';
 import { LanguageModelInfo, SBChatMessage } from './types';
-import { resolveModelInputModalities, resolveModelSupportedDocumentTypes } from './modelCapabilities';
+import { resolveModelCapabilities } from './modelCapabilities.server';
 import { hasEntitlement } from '@/lib/entitlements';
 import { ServiceError } from '@/lib/serviceError';
 import { ErrorCode } from '@/lib/errorCodes';
@@ -128,11 +128,14 @@ export const getConfiguredLanguageModels = async (): Promise<LanguageModel[]> =>
  */
 export const getConfiguredLanguageModelsInfo = async () => {
     const models = await getConfiguredLanguageModels();
-    return models.map((model): LanguageModelInfo => ({
-        provider: model.provider,
-        model: model.model,
-        displayName: model.displayName,
-        inputModalities: resolveModelInputModalities(model),
-        supportedDocumentTypes: resolveModelSupportedDocumentTypes(model),
+    return Promise.all(models.map(async (model): Promise<LanguageModelInfo> => {
+        const { inputModalities, supportedDocumentTypes } = await resolveModelCapabilities(model);
+        return {
+            provider: model.provider,
+            model: model.model,
+            displayName: model.displayName,
+            inputModalities,
+            supportedDocumentTypes,
+        };
     }));
 };
diff --git a/schemas/v3/languageModel.json b/schemas/v3/languageModel.json
index a952554b9..3f1d13d52 100644
--- a/schemas/v3/languageModel.json
+++ b/schemas/v3/languageModel.json
@@ -50,27 +50,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -114,27 +93,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -202,27 +160,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -262,27 +199,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -336,27 +252,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -412,27 +307,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -504,27 +378,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -564,27 +417,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -650,27 +482,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -726,27 +537,6 @@
                 "temperature": {
                     "type": "number",
                     "description": "Optional temperature setting to use with the model."
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -787,27 +577,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -851,27 +620,6 @@
                 },
                 "headers": {
                     "$ref": "./shared.json#/definitions/LanguageModelHeaders"
-                },
-                "inputModalities": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "text",
-                            "image",
-                            "audio",
-                            "video"
-                        ]
-                    },
-                    "description": "Optional list of input modalities (perceptual channels the model ingests natively) this model can accept. Governs single-medium attachments by their content: images require `image`, audio requires `audio`, video requires `video`, and plain-text files (.txt, .md, source code) count as `text`. Rich container documents like PDF are gated separately via `supportedDocumentTypes`. When omitted, the model is treated as text-only (fail-closed)."
-                },
-                "supportedDocumentTypes": {
-                    "type": "array",
-                    "items": {
-                        "enum": [
-                            "pdf"
-                        ]
-                    },
-                    "description": "Optional list of rich document formats (e.g. PDF) the model can ingest natively. A document here means a compound container format that bundles text with embedded images and layout, which the provider parses server-side — NOT plain-text files (.txt, .md, which count as `text`) and NOT single-medium files (images/audio/video, which are governed by `inputModalities`). When omitted, no document formats are accepted (fail-closed); this does not restrict `text` or the modalities declared in `inputModalities`."
                 }
             },
             "required": [
@@ -919,4 +667,4 @@
             "$ref": "#/definitions/XaiLanguageModel"
         }
     ]
-}
+}
\ No newline at end of file

From bf792601ee6a0da559aed28d354812bc18bc8f5c Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 12:57:45 -0700
Subject: [PATCH 08/10] stronger typing for contract

---
 packages/web/src/features/chat/types.ts | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/packages/web/src/features/chat/types.ts b/packages/web/src/features/chat/types.ts
index 50792dcb8..659551d4f 100644
--- a/packages/web/src/features/chat/types.ts
+++ b/packages/web/src/features/chat/types.ts
@@ -209,15 +209,18 @@ type _AssertAllProviders = LanguageModelProvider extends typeof languageModelPro
 const _assertAllProviders: _AssertAllProviders = true;
 void _assertAllProviders;
 
-export type InputModality = 'text' | 'image' | 'audio' | 'video';
-export type DocumentType = 'pdf';
+export const inputModalities = ['text', 'image', 'audio', 'video'] as const;
+export type InputModality = typeof inputModalities[number];
+
+export const documentTypes = ['pdf'] as const;
+export type DocumentType = typeof documentTypes[number];
 
 export const languageModelInfoSchema = z.object({
     provider: z.enum(languageModelProviders).describe("The model provider (e.g., 'anthropic', 'openai')"),
     model: z.string().describe("The model ID"),
     displayName: z.string().optional().describe("Optional display name for the model"),
-    inputModalities: z.array(z.enum(['text', 'image', 'audio', 'video'])).default(['text']).describe("The input modalities the model can accept (images, audio, video, text). Single-medium attachments are gated by these. Defaults to text-only."),
-    supportedDocumentTypes: z.array(z.enum(['pdf'])).default([]).describe("Rich compound document formats (e.g. PDF) the model can ingest natively, distinct from single-medium attachments gated by inputModalities. Defaults to none."),
+    inputModalities: z.array(z.enum(inputModalities)).default(['text']).describe("The input modalities the model can accept (images, audio, video, text). Single-medium attachments are gated by these. Defaults to text-only."),
+    supportedDocumentTypes: z.array(z.enum(documentTypes)).default([]).describe("Rich compound document formats (e.g. PDF) the model can ingest natively, distinct from single-medium attachments gated by inputModalities. Defaults to none."),
 });
 
 /**

From dbcfc8a6036429193194705352194fed248ceb79 Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 13:28:33 -0700
Subject: [PATCH 09/10] remove blocking models.dev catalog request and add
 cache warm on startup

---
 .../chat/modelCapabilities.server.test.ts     | 17 ++++++++--
 .../features/chat/modelsDevCatalog.server.ts  | 32 +++++++++++--------
 packages/web/src/initialize.ts                | 12 +++++++
 3 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/packages/web/src/features/chat/modelCapabilities.server.test.ts b/packages/web/src/features/chat/modelCapabilities.server.test.ts
index 0a2e9a2ec..4cd4121bf 100644
--- a/packages/web/src/features/chat/modelCapabilities.server.test.ts
+++ b/packages/web/src/features/chat/modelCapabilities.server.test.ts
@@ -104,17 +104,28 @@ describe('resolveModelCapabilities', () => {
         vi.unstubAllGlobals();
     });
 
-    test('fetches the catalog once and resolves capabilities (incl. provider mapping)', async () => {
+    test('fetches the catalog once in the background and resolves capabilities (incl. provider mapping)', async () => {
         const fetchMock = vi.fn(async () => ({
             ok: true,
             json: async () => catalog,
         }) as unknown as Response);
         vi.stubGlobal('fetch', fetchMock);
 
+        // The request path never blocks on the fetch: the first lookup kicks off
+        // the background fetch and falls back to text-only while it's in flight.
         expect(await resolveModelCapabilities(model('anthropic', 'claude-sonnet-4-5'))).toEqual({
-            inputModalities: ['text', 'image'],
-            supportedDocumentTypes: ['pdf'],
+            inputModalities: ['text'],
+            supportedDocumentTypes: [],
         });
+
+        // Once the background fetch settles, lookups resolve from the cached catalog.
+        await vi.waitFor(async () => {
+            expect(await resolveModelCapabilities(model('anthropic', 'claude-sonnet-4-5'))).toEqual({
+                inputModalities: ['text', 'image'],
+                supportedDocumentTypes: ['pdf'],
+            });
+        });
+
         // Subsequent lookups reuse the cached catalog rather than refetching.
         expect(await resolveModelCapabilities(model('google-generative-ai', 'gemini-2.5-pro'))).toEqual({
             inputModalities: ['text', 'image', 'audio', 'video'],
diff --git a/packages/web/src/features/chat/modelsDevCatalog.server.ts b/packages/web/src/features/chat/modelsDevCatalog.server.ts
index 8f6b35043..f2344b6f7 100644
--- a/packages/web/src/features/chat/modelsDevCatalog.server.ts
+++ b/packages/web/src/features/chat/modelsDevCatalog.server.ts
@@ -13,8 +13,10 @@ const FETCH_TIMEOUT_MS = 8000;
 // Re-fetch the (~2.4 MB) catalog at most once per this interval per server
 // process. New models trickle in daily; a stale window for a few hours is fine.
 const CATALOG_TTL_MS = 6 * 60 * 60 * 1000;
-// After a failed fetch, don't reattempt for this long. Without it, an outage in
-// models.dev would make every chat send pay the fetch timeout on the request path.
+// After a failed fetch, don't reattempt for this long. Since the request path
+// never blocks on the fetch (see loadCatalog), this throttles background
+// refresh attempts to once per interval during a models.dev outage instead of
+// kicking one off on (nearly) every request.
 const NEGATIVE_CACHE_MS = 60 * 1000;
 
 // Sourcebot provider id -> models.dev top-level catalog key. Only providers
@@ -75,9 +77,16 @@ const fetchCatalog = async (): Promise<ModelsDevCatalog | null> => {
 
 /**
  * Returns the cached models.dev catalog, refreshing it in the background when
- * stale. Only the very first load blocks on the network; thereafter the
- * last-known-good catalog is served immediately (even if stale) so the request
- * path never waits on models.dev.
+ * stale. The request path NEVER blocks on the network: the last-known-good
+ * catalog is returned immediately (even if stale), or null before the first
+ * successful fetch lands, and any refresh settles in the background.
+ *
+ * Consequences of never awaiting:
+ * - For the brief window after a cold start (before the first fetch resolves),
+ *   capability resolution falls back to text-only; it self-heals on the next
+ *   request once the background fetch populates the cache.
+ * - An unreachable catalog (e.g. an airgapped deployment) costs nothing on the
+ *   request path instead of repeatedly paying the fetch timeout.
  */
 export const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
     const now = Date.now();
@@ -87,7 +96,8 @@ export const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
     // Kick off a (deduped) refresh when the cache is stale/empty and we're not
     // within the post-failure backoff window. On success it replaces the cache;
     // on failure it only records the failure time, leaving the last-known-good
-    // catalog intact.
+    // catalog intact. The promise is intentionally not awaited here so the
+    // request path never waits on models.dev.
     if (!isFresh && !isBackingOff && !inFlightFetch) {
         inFlightFetch = fetchCatalog().then((catalog) => {
             if (catalog) {
@@ -101,11 +111,7 @@ export const loadCatalog = async (): Promise<ModelsDevCatalog | null> => {
         });
     }
 
-    // Once a catalog has loaded once, never block the request path on the
-    // network: serve the last-known-good value (even if stale) and let any
-    // refresh settle in the background. Only the very first load awaits.
-    if (cachedCatalog !== null) {
-        return cachedCatalog;
-    }
-    return inFlightFetch ?? null;
+    // Serve whatever we currently have cached (possibly null on a cold start)
+    // and let any in-flight refresh settle in the background.
+    return cachedCatalog;
 };
diff --git a/packages/web/src/initialize.ts b/packages/web/src/initialize.ts
index 0a8eb90f9..a63581ad1 100644
--- a/packages/web/src/initialize.ts
+++ b/packages/web/src/initialize.ts
@@ -4,6 +4,8 @@ import { startChangelogPollingJob } from '@/features/changelog/pollChangelog';
 import { createLogger, env } from "@sourcebot/shared";
 import { hasEntitlement } from '@/lib/entitlements';
 import { SINGLE_TENANT_ORG_ID } from './lib/constants';
+import { getConfiguredLanguageModels } from '@/features/chat/utils.server';
+import { loadCatalog } from '@/features/chat/modelsDevCatalog.server';
 
 const logger = createLogger('web-initialize');
 
@@ -73,8 +75,18 @@ const init = async () => {
     }
 }
 
+const warmModelCapabilitiesCatalog = async () => {
+    const configuredModels = await getConfiguredLanguageModels();
+    if (configuredModels.length === 0) {
+        return;
+    }
+    logger.info(`Warming models.dev capability catalog for ${configuredModels.length} configured language model(s)`);
+    void loadCatalog();
+};
+
 (async () => {
     await init();
     startServicePingCronJob();
     startChangelogPollingJob();
+    await warmModelCapabilitiesCatalog();
 })();

From 7ba297bca6208a1f0cf26e5b6b4d19dd5d91886c Mon Sep 17 00:00:00 2001
From: whoisthey <joedimagio23@proton.me>
Date: Sat, 27 Jun 2026 13:45:34 -0700
Subject: [PATCH 10/10] clean up catalog warming

---
 .../features/chat/modelContextWindow.test.ts  | 13 ++++++++--
 .../web/src/features/chat/utils.server.ts     | 26 ++++++++++++++++++-
 packages/web/src/initialize.ts                | 14 ++--------
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/packages/web/src/features/chat/modelContextWindow.test.ts b/packages/web/src/features/chat/modelContextWindow.test.ts
index 9476820ae..818251a3f 100644
--- a/packages/web/src/features/chat/modelContextWindow.test.ts
+++ b/packages/web/src/features/chat/modelContextWindow.test.ts
@@ -81,13 +81,19 @@ describe('resolveContextWindow', () => {
         vi.unstubAllGlobals();
     });
 
-    test('fetches the catalog once and resolves windows (incl. provider mapping)', async () => {
+    test('fetches the catalog once in the background and resolves windows (incl. provider mapping)', async () => {
         const fetchMock = vi.fn(async () => ({
             ok: true,
             json: async () => catalog,
         }) as unknown as Response);
         vi.stubGlobal('fetch', fetchMock);
 
+        // The request path never blocks on the fetch: the first lookup kicks off
+        // the background fetch and falls back to "unknown" while it's in flight.
+        expect(await resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBeUndefined();
+
+        // Once the background fetch settles, lookups resolve from the cached catalog.
+        await new Promise((resolve) => setTimeout(resolve, 0));
         expect(await resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBe(200000);
         // Subsequent lookups reuse the cached catalog rather than refetching.
         expect(await resolveContextWindow(model('google-generative-ai', 'gemini-2.5-pro'))).toBe(1048576);
@@ -141,7 +147,10 @@ describe('resolveContextWindow resilience', () => {
 
         const mod = await importFresh();
 
-        // First load populates the cache.
+        // First load kicks off the background fetch (returning the "unknown"
+        // fallback until it settles), which then populates the cache.
+        expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBeUndefined();
+        await new Promise((resolve) => setTimeout(resolve, 0));
         expect(await mod.resolveContextWindow(model('anthropic', 'claude-sonnet-4-5'))).toBe(200000);
         expect(fetchMock).toHaveBeenCalledTimes(1);
 
diff --git a/packages/web/src/features/chat/utils.server.ts b/packages/web/src/features/chat/utils.server.ts
index a458c41d8..90c83c859 100644
--- a/packages/web/src/features/chat/utils.server.ts
+++ b/packages/web/src/features/chat/utils.server.ts
@@ -3,16 +3,19 @@ import 'server-only';
 import { getAnonymousId } from '@/lib/anonymousId';
 import { Chat, Prisma, PrismaClient, User } from '@sourcebot/db';
 import { LanguageModel } from '@sourcebot/schemas/v3/languageModel.type';
-import { env, loadConfig } from '@sourcebot/shared';
+import { createLogger, env, loadConfig } from '@sourcebot/shared';
 import fs from 'fs';
 import path from 'path';
 import { LanguageModelInfo, SBChatMessage } from './types';
 import { resolveModelCapabilities } from './modelCapabilities.server';
+import { loadCatalog } from './modelsDevCatalog.server';
 import { hasEntitlement } from '@/lib/entitlements';
 import { ServiceError } from '@/lib/serviceError';
 import { ErrorCode } from '@/lib/errorCodes';
 import { StatusCodes } from 'http-status-codes';
 
+const logger = createLogger('chat-utils');
+
 /**
  * Returns a FORBIDDEN ServiceError when the deployment lacks the `ask`
  * entitlement, or null when Ask is available. Gates the generative chat
@@ -139,3 +142,24 @@ export const getConfiguredLanguageModelsInfo = async () => {
         };
     }));
 };
+
+/**
+ * Eagerly warms the models.dev capability catalog at server startup so the first
+ * request after a cold start resolves real model capabilities instead of the
+ * text-only fallback. No-op when no language models are configured (avoids a
+ * gratuitous outbound call for deployments not using Ask). Best-effort and
+ * non-blocking: loadCatalog kicks off a background fetch and resolves right away,
+ * so awaiting it routes its rejection to the logging handler without waiting.
+ */
+export const warmModelCapabilitiesCatalog = (): void => {
+    void (async () => {
+        const configuredModels = await getConfiguredLanguageModels();
+        if (configuredModels.length === 0) {
+            return;
+        }
+        logger.info(`Warming models.dev capability catalog for ${configuredModels.length} configured language model(s)`);
+        await loadCatalog();
+    })().catch((error) => {
+        logger.error(`Failed to warm models.dev capability catalog: ${error}`);
+    });
+};
diff --git a/packages/web/src/initialize.ts b/packages/web/src/initialize.ts
index a63581ad1..406116dee 100644
--- a/packages/web/src/initialize.ts
+++ b/packages/web/src/initialize.ts
@@ -4,8 +4,7 @@ import { startChangelogPollingJob } from '@/features/changelog/pollChangelog';
 import { createLogger, env } from "@sourcebot/shared";
 import { hasEntitlement } from '@/lib/entitlements';
 import { SINGLE_TENANT_ORG_ID } from './lib/constants';
-import { getConfiguredLanguageModels } from '@/features/chat/utils.server';
-import { loadCatalog } from '@/features/chat/modelsDevCatalog.server';
+import { warmModelCapabilitiesCatalog } from '@/features/chat/utils.server';
 
 const logger = createLogger('web-initialize');
 
@@ -75,18 +74,9 @@ const init = async () => {
     }
 }
 
-const warmModelCapabilitiesCatalog = async () => {
-    const configuredModels = await getConfiguredLanguageModels();
-    if (configuredModels.length === 0) {
-        return;
-    }
-    logger.info(`Warming models.dev capability catalog for ${configuredModels.length} configured language model(s)`);
-    void loadCatalog();
-};
-
 (async () => {
     await init();
     startServicePingCronJob();
     startChangelogPollingJob();
-    await warmModelCapabilitiesCatalog();
+    warmModelCapabilitiesCatalog();
 })();