Commit c70c6c4

committed
add a python script for an automated website content analysis & SEO extraction
1 parent 8ffd3a5 commit c70c6c4

File tree

1 file changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
import os
import re

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Load environment variables
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Validate API key
if not api_key:
    raise ValueError("No API key was found - please check your .env file.")

# Initialize OpenAI client (reads OPENAI_API_KEY from the environment)
openai = OpenAI()

# Set up Selenium WebDriver (headless Chrome)
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

class Website:
    """Scrapes and processes website content using Selenium."""

    def __init__(self, url: str):
        self.url = url
        self.text = "No content extracted."

        # Path assumes a Homebrew-installed chromedriver; adjust for your system
        service = Service(executable_path="/opt/homebrew/bin/chromedriver")
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            driver.get(url)
            # Wait up to 10 seconds for the page body to render
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            body_element = driver.find_element(By.TAG_NAME, "body")
            self.text = body_element.text.strip() if body_element else "No content extracted."
        except Exception as e:
            print(f"Error fetching website: {e}")
        finally:
            driver.quit()

    def summarized_text(self, max_length=1500):
        # Truncate long pages so prompts stay within a manageable size
        return self.text[:max_length] + ("..." if len(self.text) > max_length else "")
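
# Illustrative usage (hypothetical URL):
#   site = Website("https://example.com")
#   print(site.summarized_text(200))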

def clean_text(text):
    """Cleans extracted text by removing markdown-style formatting."""
    text = re.sub(r"#+\s*", "", text)             # strip heading markers
    text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)  # strip bold markers
    return text.strip()
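
# Example: clean_text("### Top **Courses**") -> "Top Courses"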

# Aspect-specific prompts for concise output
aspect_prompts = {
    "Marketing Strategies": "Summarize the core marketing strategies used on this website in under 30 words. Do not include a title or introduction.",
    "SEO Keywords": "List only the most relevant SEO keywords from this website, separated by commas. Do not include a title or introduction.",
    "User Engagement Tactics": "List key engagement tactics used on this website (e.g., interactive features, user incentives, social proof). Keep responses to 3-5 bullet points. Do not include a title or introduction.",
    "Call-to-Action Phrases": "List only the most common Call-to-Action phrases used on this website, separated by commas. Do not include a title or introduction.",
    "Branding Elements": "Summarize the brand's tone, style, and positioning in under 30 words. Do not include a title or introduction.",
    "Competitor Comparison": "Briefly describe how this website differentiates itself from competitors in under 30 words. Do not include a title or introduction.",
    "Product Descriptions": "List the most important features or benefits of the products/services described on this website in under 30 words. Do not include a title or introduction.",
    "Customer Reviews Sentiment": "Summarize the overall sentiment of customer reviews in under 30 words, highlighting common themes. Do not include a title or introduction.",
    "Social Media Strategy": "List key social media strategies used on this website, separated by commas. Do not include a title or introduction."
}
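
# Each aspect above is queried as a separate chat request in summarize() below.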

def summarize(url: str) -> dict:
    """
    Fetches a website, extracts relevant content, and generates a separate summary for each aspect.

    :param url: The website URL to analyze.
    :return: A dictionary containing the extracted information.
    """
    website = Website(url)

    if not website.text or website.text == "No content extracted.":
        return {"URL": url, "Error": "Failed to extract content"}

    extracted_data = {"URL": url}

    for aspect, prompt in aspect_prompts.items():
        try:
            formatted_prompt = f"{prompt}\n\nContent:\n{website.summarized_text()}"
            response = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are an expert at extracting structured information from website content."},
                    {"role": "user", "content": formatted_prompt}
                ]
            )

            extracted_data[aspect] = clean_text(response.choices[0].message.content)

        except Exception as e:
            extracted_data[aspect] = f"Error generating summary: {e}"

    return extracted_data
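
# Illustrative result shape:
#   {"URL": "https://...", "Marketing Strategies": "...", "SEO Keywords": "...", ...}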

def save_to_excel(data_list: list, filename="website_analysis.xlsx"):
    """
    Saves extracted information to an Excel file with proper formatting.

    :param data_list: A list of dictionaries containing extracted website details.
    :param filename: The name of the Excel file to save the data to.
    """
    df = pd.DataFrame(data_list)
    df.to_excel(filename, index=False)

    wb = load_workbook(filename)
    ws = wb.active

    # Auto-adjust column widths, capped at 50 characters
    for col in ws.columns:
        max_length = 0
        col_letter = col[0].column_letter
        for cell in col:
            if cell.value:
                max_length = max(max_length, len(str(cell.value)))
        ws.column_dimensions[col_letter].width = min(max_length + 2, 50)

    # Bold, centered headers
    for cell in ws[1]:
        cell.font = Font(bold=True)
        cell.alignment = Alignment(horizontal="center", vertical="center")

    # Wrap text for extracted content
    for row in ws.iter_rows(min_row=2):
        for cell in row:
            cell.alignment = Alignment(wrap_text=True, vertical="top")

    wb.save(filename)
    print(f"Data saved to {filename} with improved formatting.")

# 🔹 LIST OF WEBSITES TO PROCESS
websites = [
    "https://www.udacity.com/",
    "https://www.coursera.org",
    "https://www.udemy.com",
    "https://www.edx.org",
    "https://www.freecodecamp.org/",
    "https://www.datacamp.com/",
    "https://www.w3schools.com/",
    "https://www.futurelearn.com/",
    "https://codefirstgirls.com/",
    "https://www.linkedin.com/learning",
]

if __name__ == "__main__":
    print("\nProcessing websites...\n")
    extracted_data_list = []

    for site in websites:
        print(f"Extracting data from {site}...")
        extracted_data = summarize(site)
        extracted_data_list.append(extracted_data)

    save_to_excel(extracted_data_list)
    print("\nAll websites processed successfully!")
