step1.自动化爬取TCMSP中药成分

该文章已生成可运行项目,运行前请先安装依赖:
pip install requests beautifulsoup4 pandas

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_page_content(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None``. A diagnostic message is printed for every failure.
    """
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled connection, which would freeze the whole crawl.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error occurred while retrieving page {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page {url}, status code: {response.status_code}")
        return None
    return response.text

def parse_tcmsp_table(html_content):
    """Extract the data rows of the ingredient table from a page.

    Args:
        html_content: HTML source of the page.

    Returns:
        A list of rows. Each row is a list holding the stripped text of every
        ``<td>`` cell of one ``<tr>``; the first ``<tr>`` is skipped as the
        header row. An empty list is returned when the table is not found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # The ingredient data is assumed to live in a <table> carrying a known id;
    # replace 'target_table_id' with the identifier used by the real page.
    table = soup.find('table', {'id': 'target_table_id'})
    if table is None:
        # find() returns None when nothing matches; report it instead of
        # failing later with an AttributeError.
        print("Target table not found in page content")
        return []

    return [
        [td.get_text(strip=True) for td in tr.find_all('td')]
        for tr in table.find_all('tr')[1:]
    ]

def save_to_csv(data, filename='tcmsp_data.csv', columns=None):
    """Write the scraped rows to a CSV file without the index column.

    Args:
        data: Row data accepted by ``pandas.DataFrame`` (e.g. a list of lists).
        filename: Path of the CSV file to create or overwrite.
        columns: Optional list of column names used as the header row. When
            omitted, pandas' default integer column labels are written.

    Raises:
        ValueError: Raised by pandas when ``columns`` does not match the
            number of columns in ``data``.
    """
    # Naming the columns on the frame keeps the header in sync with the data
    # width; a fixed alias list passed to to_csv fails whenever the counts
    # differ.
    df = pd.DataFrame(data, columns=columns)
    df.to_csv(filename, index=False)


# Crawl pages 1-9 of the target listing and store every parsed row in a CSV.
base_url = "http://example.com/path?"  # change this to the herb you want to crawl
urls = [f"{base_url}{page_no}" for page_no in range(1, 10)]  # one link per page
all_data = []

for url in urls:
    html = get_page_content(url)
    if not html:
        # Nothing was downloaded for this page; move on to the next link.
        continue
    data = parse_tcmsp_table(html)
    all_data += data
    time.sleep(5)  # pause between requests to avoid hitting the site too often

save_to_csv(all_data)
  • 使用requests库处理HTTP请求,BeautifulSoup解析HTML结构,pandas存储数据

  • 支持动态翻页处理,自动识别页码并循环抓取(如25个靶点自动分3页处理)

    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    
    def get_tcmsp_targets(compound_id, timeout=10):
        """Collect the targets listed for one compound across all result pages.

        Args:
            compound_id: Value placed in the ``qn`` query parameter of the
                molecule page URL.
            timeout: Seconds to wait for each HTTP response.

        Returns:
            A ``pandas.DataFrame`` with the columns ``target_name`` and
            ``source_db``, one row per ``target-item`` element found. The
            columns are present even when no target was found.

        Raises:
            requests.RequestException: If a request fails, times out or the
                server answers with an HTTP error status.
        """
        base_url = f'https://old.tcmsp-e.com/molecule.php?qn={compound_id}'
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

        def fetch_soup(url):
            # Fail fast on stalled connections and HTTP errors instead of
            # parsing an error page as if it were a result page.
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')

        soup = fetch_soup(base_url)

        # Derive the page count from the pagination text: the first token
        # after the last '/' is read as the total item count.
        # NOTE(review): the arithmetic assumes 10 items per page - confirm.
        pages = 1
        pagination = soup.find('div', class_='pagination')
        if pagination is not None:
            try:
                total_items = int(pagination.text.split('/')[-1].split()[0])
            except (IndexError, ValueError):
                # Unreadable pagination text: fall back to a single page.
                pass
            else:
                pages = (total_items + 9) // 10

        targets = []
        for page in range(1, pages + 1):
            page_soup = fetch_soup(f'{base_url}&page={page}')
            for item in page_soup.find_all('div', class_='target-item'):
                name_tag = item.find('span', class_='target-name')
                source_tag = item.find('span', class_='source-db')
                # Keep the row even when one of the spans is absent.
                targets.append({
                    'target_name': name_tag.text if name_tag is not None else None,
                    'source_db': source_tag.text if source_tag is not None else None,
                })

        return pd.DataFrame(targets, columns=['target_name', 'source_db'])

    也可以用R语言实现

  • library(RSelenium)
    library(rvest)
    
    # Connect to the Selenium server on localhost:4444 and open a Chrome session
    # NOTE(review): the server is presumably started beforehand (e.g. in Docker) - confirm
    remDr <- remoteDriver(
      remoteServerAddr = "localhost",
      port = 4444L,
      browserName = "chrome"
    )
    remDr$open()
    
    # Scrape the targets of a single page: load `url` in the shared browser
    # session and return the text of every ".target-item" node in the page source.
    get_page_targets <- function(url) {
      remDr$navigate(url)
      rendered_html <- remDr$getPageSource()[[1]]
      target_nodes <- html_nodes(read_html(rendered_html), ".target-item")
      html_text(target_nodes)
    }
    
    # Batch processing: fetch the targets of every compound, one list element per ID
    compound_ids <- c("1002", "1454", "2659")  # example ID list
    all_targets <- lapply(
      paste0("https://old.tcmsp-e.com/molecule.php?qn=", compound_ids),
      get_page_targets
    )

本文章已经生成可运行项目
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值