import os
import pandas as pd
import re
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from openai import OpenAI
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment

# Load environment variables
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Validate API key
if not api_key:
    raise ValueError("No API key was found - please check your .env file.")

# Initialize the OpenAI client; passing the validated key explicitly makes
# the dependency on OPENAI_API_KEY obvious
openai = OpenAI(api_key=api_key)

# Set up Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
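# The last two flags are mainly needed when Chrome runs inside a container
# (e.g. Docker or CI); they are harmless but optional for local runs.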

class Website:
    """Scrapes and processes website content using Selenium."""

    def __init__(self, url: str):
        self.url = url
        self.text = "No content extracted."

        # NOTE: this path assumes chromedriver was installed via Homebrew on
        # macOS; with Selenium 4.6+ you can omit executable_path and let
        # Selenium Manager resolve a matching driver automatically.
        service = Service(executable_path="/opt/homebrew/bin/chromedriver")
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # find_element raises if <body> is absent (caught below), so no
            # None check is needed; fall back to the placeholder when empty
            body_text = driver.find_element(By.TAG_NAME, "body").text.strip()
            self.text = body_text or "No content extracted."
        except Exception as e:
            print(f"Error fetching website: {e}")
        finally:
            driver.quit()

    def summarized_text(self, max_length=1500):
        """Return at most max_length characters of page text to keep prompts compact."""
        return self.text[:max_length] + ("..." if len(self.text) > max_length else "")

def clean_text(text):
    """
    Cleans extracted text by removing markdown-style formatting.
    """
    text = re.sub(r"#+\s*", "", text)             # strip heading markers (#, ##, ...)
    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)  # unwrap **bold** spans
    return text.strip()

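# Illustrative behavior: clean_text("### Overview\n**Fast** shipping")
# returns "Overview\nFast shipping".
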
# Aspect-specific prompts for concise output
aspect_prompts = {
    "Marketing Strategies": "Summarize the core marketing strategies used on this website in under 30 words. Do not include a title or introduction.",
    "SEO Keywords": "List only the most relevant SEO keywords from this website, separated by commas. Do not include a title or introduction.",
    "User Engagement Tactics": "List key engagement tactics used on this website (e.g., interactive features, user incentives, social proof). Keep responses to 3-5 bullet points. Do not include a title or introduction.",
    "Call-to-Action Phrases": "List only the most common Call-to-Action phrases used on this website, separated by commas. Do not include a title or introduction.",
    "Branding Elements": "Summarize the brand's tone, style, and positioning in under 30 words. Do not include a title or introduction.",
    "Competitor Comparison": "Briefly describe how this website differentiates itself from competitors in under 30 words. Do not include a title or introduction.",
    "Product Descriptions": "List the most important features or benefits of the products/services described on this website in under 30 words. Do not include a title or introduction.",
    "Customer Reviews Sentiment": "Summarize the overall sentiment of customer reviews in under 30 words, highlighting common themes. Do not include a title or introduction.",
    "Social Media Strategy": "List key social media strategies used on this website, separated by commas. Do not include a title or introduction."
}


def summarize(url: str) -> dict:
    """
    Fetches a website, extracts relevant content, and generates a separate summary for each aspect.

    :param url: The website URL to analyze.
    :return: A dictionary containing extracted information.
    """
    website = Website(url)

    if not website.text or website.text == "No content extracted.":
        return {"URL": url, "Error": "Failed to extract content"}

    extracted_data = {"URL": url}

    for aspect, prompt in aspect_prompts.items():
        try:
            formatted_prompt = f"{prompt}\n\nContent:\n{website.summarized_text()}"
            response = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are an expert at extracting structured information from website content."},
                    {"role": "user", "content": formatted_prompt}
                ]
            )

            extracted_data[aspect] = clean_text(response.choices[0].message.content)

        except Exception as e:
            # Record the failure for this aspect so one bad call does not lose the rest
            extracted_data[aspect] = f"Error generating summary: {e}"

    return extracted_data

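# Illustrative shape of the dict summarize() returns (actual values depend on
# the model's output):
#   {"URL": "https://example.com", "Marketing Strategies": "...", ...}
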
def save_to_excel(data_list: list, filename="website_analysis.xlsx"):
    """
    Saves extracted information to an Excel file with proper formatting.

    :param data_list: A list of dictionaries containing extracted website details.
    :param filename: The name of the Excel file to save data.
    """
    df = pd.DataFrame(data_list)

    df.to_excel(filename, index=False)

    # Re-open the file with openpyxl to apply formatting pandas does not expose
    wb = load_workbook(filename)
    ws = wb.active

    # Auto-adjust column widths, capped at 50 characters so long summaries
    # rely on wrapped text rather than very wide columns
    for col in ws.columns:
        max_length = 0
        col_letter = col[0].column_letter
        for cell in col:
            if cell.value:
                max_length = max(max_length, len(str(cell.value)))
        ws.column_dimensions[col_letter].width = min(max_length + 2, 50)

    # Format headers
    for cell in ws[1]:
        cell.font = Font(bold=True)
        cell.alignment = Alignment(horizontal="center", vertical="center")

    # Wrap text for extracted content
    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = Alignment(wrap_text=True, vertical="top")

    wb.save(filename)
    print(f"Data saved to {filename} with improved formatting.")

# 🔹 LIST OF WEBSITES TO PROCESS
websites = [
    "https://www.gymshark.com/",
]
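# Note: summarize() returns a dict with an "Error" key when scraping fails,
# so one unreachable site will not abort the batch below.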

if __name__ == "__main__":
    print("\nProcessing websites...\n")
    extracted_data_list = []

    for site in websites:
        print(f"Extracting data from {site}...")
        extracted_data = summarize(site)
        extracted_data_list.append(extracted_data)

    save_to_excel(extracted_data_list)
    print("\nAll websites processed. Check the Excel file for any per-site errors.")