# Install the dependencies first (shell command, not Python):
#   pip install requests beautifulsoup4 pandas
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
def get_page_content(url, timeout=10):
    """Download *url* and return its body as text, or ``None`` on failure.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        when the status code is anything else or the request itself fails.
        Failures are reported on stdout rather than raised.
    """
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled connection, which would freeze the whole crawl.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error occurred while retrieving page {url}: {e}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to retrieve page {url}, status code: {response.status_code}")
    return None
def parse_tcmsp_table(html_content, table_id='target_table_id'):
    """Extract the data rows of the ingredient table from a page.

    Args:
        html_content: HTML source of the page.
        table_id: ``id`` attribute of the ``<table>`` holding the data.
            The default is a placeholder; replace it with the identifier
            used by the real page.

    Returns:
        A list of rows, each a list of the stripped text of the row's
        ``<td>`` cells. The first ``<tr>`` is skipped (it is treated as the
        header row) and rows without any ``<td>`` cell are dropped. An empty
        list is returned when the table is not present.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # The first argument of find() is the tag name, so it has to be 'table';
    # the table is then narrowed down by its id attribute.
    table = soup.find('table', {'id': table_id})
    if table is None:
        print(f"Table with id '{table_id}' not found in page")
        return []

    rows = []
    for tr in table.find_all('tr')[1:]:
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        # Rows made only of <th> cells yield no data; skip them so they do
        # not turn into blank records.
        if cells:
            rows.append(cells)
    return rows
def save_to_csv(data, filename='tcmsp_data.csv', columns=None):
    """Write the scraped rows to a CSV file.

    Args:
        data: Sequence of rows (each row a sequence of cell values).
        filename: Path of the CSV file to create or overwrite.
        columns: Optional list of column names, one per cell in a row,
            e.g. ``['MOL_ID', 'Name', 'OB']``. When omitted the file is
            written without a header row.

    Raises:
        ValueError: If ``columns`` is given and its length does not match
            the width of the rows (raised by pandas).
    """
    # The column names depend on the scraped table, so they are supplied by
    # the caller instead of being hard-coded placeholders.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(filename, index=False, header=columns is not None)
# Driver: fetch pages 1-9 of the listing, collect every table row and save
# them to CSV. Guarded so that importing this module does not start a crawl.
if __name__ == "__main__":
    base_url = "http://example.com/path?"  # change to the URL of your target herb
    urls = [base_url + str(i) for i in range(1, 10)]  # build the page links

    all_data = []
    for url in urls:
        html = get_page_content(url)
        if html:
            all_data.extend(parse_tcmsp_table(html))
        # Pause after every request, successful or not, to avoid hitting
        # the server too frequently.
        time.sleep(5)

    save_to_csv(all_data)
- 使用 requests 库处理 HTTP 请求,BeautifulSoup 解析 HTML 结构,pandas 存储数据
- 支持动态翻页处理,自动识别页码并循环抓取(如 25 个靶点自动分 3 页处理)
import requests
from bs4 import BeautifulSoup
import pandas as pd


def _count_target_pages(soup, page_size):
    """Return the number of result pages announced by the pagination block."""
    pagination = soup.find('div', class_='pagination')
    if pagination is None:
        # No pagination block: treat the result as a single page.
        return 1
    # The total item count is read as the first token after the last '/'
    # in the pagination text.
    # NOTE(review): confirm this format against the live page markup.
    try:
        total_items = int(pagination.text.split('/')[-1].split()[0])
    except (IndexError, ValueError):
        return 1
    # Ceiling division: e.g. 25 items at 10 per page -> 3 pages.
    return (total_items + page_size - 1) // page_size


def get_tcmsp_targets(compound_id, page_size=10, timeout=10):
    """Collect the targets listed for one compound, following pagination.

    Args:
        compound_id: Value placed in the ``qn`` query parameter of the
            molecule page URL.
        page_size: Number of targets assumed per result page.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A ``pandas.DataFrame`` with the columns ``target_name`` and
        ``source_db``, one row per ``div.target-item`` found. A field is
        ``None`` when the corresponding ``<span>`` is missing from an item.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
    """
    base_url = f'https://old.tcmsp-e.com/molecule.php?qn={compound_id}'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    response = requests.get(base_url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of trying to parse it.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Work out the page count dynamically from the first response.
    pages = _count_target_pages(soup, page_size)

    targets = []
    for page in range(1, pages + 1):
        page_url = f'{base_url}&page={page}'
        page_response = requests.get(page_url, headers=headers, timeout=timeout)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.text, 'html.parser')

        for item in page_soup.find_all('div', class_='target-item'):
            name_tag = item.find('span', class_='target-name')
            source_tag = item.find('span', class_='source-db')
            targets.append({
                'target_name': name_tag.text if name_tag is not None else None,
                'source_db': source_tag.text if source_tag is not None else None,
            })

    # Explicit columns keep the frame's shape stable when nothing was found.
    return pd.DataFrame(targets, columns=['target_name', 'source_db'])


# The same scraping can also be implemented in R.
-
library(RSelenium)
library(rvest)

# Connect to a Selenium server on localhost:4444 driving Chrome.
# NOTE(review): the original note says the server is started as a Docker
# container; it must already be running before this script is executed.
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4444L, browserName = "chrome")
remDr$open()

# Scrape the targets of a single page.
# Navigates the shared browser session to `url`, parses the rendered page
# source and returns the text of every node matching ".target-item".
get_page_targets <- function(url) {
  remDr$navigate(url)
  page_source <- remDr$getPageSource()[[1]]
  page <- read_html(page_source)
  targets <- page %>% html_nodes(".target-item") %>% html_text()
  return(targets)
}

# Batch processing: one character vector of targets per compound.
compound_ids <- c("1002", "1454", "2659")  # example ID list
all_targets <- lapply(compound_ids, function(id) {
  url <- paste0("https://old.tcmsp-e.com/molecule.php?qn=", id)
  get_page_targets(url)
})

# Release the browser session once all pages have been scraped.
remDr$close()
210

被折叠的 条评论
为什么被折叠?



