{
"openapi": "3.0.0",
"info": {
"title": "RAG Web Browser",
"description": "Web browser for OpenAI Assistants API and RAG pipelines, similar to a web browser in ChatGPT. It queries Google Search, scrapes the top N pages from the results, and returns their cleaned content as Markdown for further processing by an LLM.",
"version": "v1"
},
"servers": [
{
"url": "https://rag-web-browser.apify.actor/"
}
],
"paths": {
"/search": {
"get": {
"operationId": "apify_rag-web-browser",
"x-openai-isConsequential": false,
"description": "Web browser for OpenAI Assistants API and RAG pipelines, similar to a web browser in ChatGPT. It queries Google Search, scrapes the top N pages from the results, and returns their cleaned content as Markdown for further processing by an LLM.",
"summary": "Web browser for OpenAI Assistants API and RAG pipelines, similar to a web browser in ChatGPT. It queries Google Search, scrapes the top N pages from the results, and returns their cleaned content as Markdown for further processing by an LLM.",
"parameters": [
{
"name": "query",
"in": "query",
"description": "Enter Google Search keywords or a URL of a specific web page. The keywords might include the [advanced search operators](https://blog.apify.com/how-to-scrape-google-like-a-pro/). Examples:\n\n- san francisco weather
\n- https://www.cnn.com
\n- function calling site:openai.com
",
"required": true,
"schema": {
"type": "string",
"pattern": "[^\\s]+"
}
},
{
"name": "maxResults",
"in": "query",
"description": "The maximum number of top organic Google Search results whose web pages will be extracted. If `query` is a URL, then this field is ignored and the Actor only fetches the specific web page.",
"required": false,
"schema": {
"type": "integer",
"minimum": 1,
"maximum": 100,
"default": 3
}
},
{
"name": "outputFormats",
"in": "query",
"description": "Select one or more formats to which the target web pages will be extracted and saved in the resulting dataset.",
"required": false,
"schema": {
"type": "array",
"items": {
"type": "string",
"enum": [
"text",
"markdown",
"html"
]
},
"default": [
"markdown"
]
},
"style": "form",
"explode": false
},
{
"name": "requestTimeoutSecs",
"in": "query",
"description": "The maximum time in seconds available for the request, including querying Google Search and scraping the target web pages. For example, OpenAI allows only [45 seconds](https://platform.openai.com/docs/actions/production#timeouts) for custom actions. If a target page loading and extraction exceeds this timeout, the corresponding page will be skipped in results to ensure at least some results are returned within the timeout. If no page is extracted within the timeout, the whole request fails.",
"required": false,
"schema": {
"type": "integer",
"minimum": 1,
"maximum": 600,
"default": 40
}
},
{
"name": "serpProxyGroup",
"in": "query",
"description": "Enables overriding the default Apify Proxy group used for fetching Google Search results.",
"required": false,
"schema": {
"type": "string",
"enum": [
"GOOGLE_SERP",
"SHADER"
],
"default": "GOOGLE_SERP"
}
},
{
"name": "serpMaxRetries",
"in": "query",
"description": "The maximum number of times the Actor will retry fetching the Google Search results on error. If the last attempt fails, the entire request fails.",
"required": false,
"schema": {
"type": "integer",
"minimum": 0,
"maximum": 3,
"default": 1
}
},
{
"name": "scrapingTool",
"in": "query",
"description": "Select a scraping tool for extracting the target web pages. The Browser tool is more powerful and can handle JavaScript heavy websites, while the Plain HTML tool can't handle JavaScript but is about two times faster.",
"required": false,
"schema": {
"type": "string",
"enum": [
"browser-playwright",
"raw-http"
],
"default": "raw-http"
}
},
{
"name": "removeElementsCssSelector",
"in": "query",
"required": false,
"description": "A CSS selector matching HTML elements that will be removed from the DOM, before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline image. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`.",
"schema": {
"type": "string",
"default": "nav, footer, script, style, noscript, svg, img[src^='data:'],\n[role=\"alert\"],\n[role=\"banner\"],\n[role=\"dialog\"],\n[role=\"alertdialog\"],\n[role=\"region\"][aria-label*=\"skip\" i],\n[aria-modal=\"true\"]"
}
},
{
"name": "maxRequestRetries",
"in": "query",
"description": "The maximum number of times the Actor will retry loading the target web page on error. If the last attempt fails, the page will be skipped in the results.",
"required": false,
"schema": {
"type": "integer",
"minimum": 0,
"maximum": 3,
"default": 1
}
},
{
"name": "dynamicContentWaitSecs",
"in": "query",
"description": "The maximum time in seconds to wait for dynamic page content to load. The Actor considers the web page as fully loaded once this time elapses or when the network becomes idle.",
"required": false,
"schema": {
"type": "integer",
"default": 10
}
},
{
"name": "removeCookieWarnings",
"in": "query",
"description": "If enabled, the Actor attempts to close or remove cookie consent dialogs to improve the quality of extracted text. Note that this setting increases the latency.",
"required": false,
"schema": {
"type": "boolean",
"default": true
}
},
{
"name": "debugMode",
"in": "query",
"description": "If enabled, the Actor will store debugging information into the resulting dataset under the `debug` field.",
"required": false,
"schema": {
"type": "boolean",
"default": false
}
}
],
"responses": {
"200": {
"description": "OK"
}
}
}
}
}
}