fix: update Gemini URL scraper prompt to extract exact content instead of summaries

claude[bot] · MrOrz · claude[bot] · commit ef1ca5f2c436 · 2025-08-24T14:35:18.000Z
- Changed prompt to explicitly request original text content without rewriting
- Added clear instructions to preserve all claims and statements as written
- This ensures content can be properly indexed for fact-checking purposes

Co-authored-by: Johnson Liang <MrOrz@users.noreply.github.com>
diff --git a/src/util/geminiUrlScraper.js b/src/util/geminiUrlScraper.js
@@ -41,7 +41,7 @@ export default async function scrapeUrlsWithGemini(urls) {
           role: 'user',
           parts: [
             {
-              text: `Please analyze the content at these URLs and extract information from each:
+              text: `Please analyze the content at these URLs and extract the exact original text content from each:
 
 ${urlList}
 
@@ -51,16 +51,16 @@ For each URL, extract and return a JSON array with objects having the following
     "url": "original URL from the list",
     "canonical": "canonical URL if different from original, or same as original",
     "title": "The main title of the page",
-    "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes",
+    "summary": "The exact original text content from the page without any rewriting, summarization, or paraphrasing - preserve all claims, statements, and information as written in the source",
     "topImageUrl": "URL of the most representative image on the page, or null if none exists"
   }
 ]
 
 Requirements:
 - url: Return the exact original URL from the input list
 - canonical: Extract the canonical URL from meta tags or use the original URL if no canonical is found
-- title: Extract the main page title
-- summary: Should be detailed enough for search and fact-checking, capturing all important claims and information
+- title: Extract the main page title exactly as it appears
+- summary: Extract the complete original text content from the page - DO NOT summarize, rewrite, or paraphrase. Keep all original claims, statements, facts, and information exactly as written in the source material. This text will be used for indexing and search purposes.
 - topImageUrl: Find the most representative image (not logos, ads, or decorative images), return null if no suitable image exists
 - Return valid JSON array only, no markdown code blocks or explanations
 - Process all URLs and return results for each, even if some fail`,