Files
cyzg/crawler/get_tianyi_1.ipynb
2025-08-21 17:11:58 +08:00

299 lines
8.0 KiB
Plaintext
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-08-21T08:51:45.729834Z",
"start_time": "2025-08-21T08:51:45.724789Z"
}
},
"source": [
"import time\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.edge.service import Service\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.webdriver.edge.options import Options"
],
"outputs": [],
"execution_count": 10
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T08:51:45.747072Z",
"start_time": "2025-08-21T08:51:45.743237Z"
}
},
"cell_type": "code",
"source": [
"from bs4 import BeautifulSoup\n",
"import sqlite3"
],
"id": "f184b255d5098302",
"outputs": [],
"execution_count": 11
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T08:51:45.773737Z",
"start_time": "2025-08-21T08:51:45.769129Z"
}
},
"cell_type": "code",
"source": [
"# Location of the SQLite database file, relative to the working directory.\n",
"db_path = \"../data.db\"\n",
"# Open a connection to that database (no cell in this notebook closes it).\n",
"conn = sqlite3.connect(db_path)"
],
"id": "4813fcf4dea28b8d",
"outputs": [],
"execution_count": 12
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T08:51:45.797561Z",
"start_time": "2025-08-21T08:51:45.790647Z"
}
},
"cell_type": "code",
"source": [
"# Build the Edge launch options: start empty, then attach each switch in order.\n",
"edge_options = Options()\n",
"# edge_options.add_argument(\"--headless\")  # optional: run without a visible window\n",
"for launch_switch in (\n",
"    \"--disable-gpu\",\n",
"    \"--no-sandbox\",\n",
"    \"--disable-extensions\",\n",
"    \"--disable-plugins\",\n",
"    \"--disable-popup-blocking\",\n",
"    \"--disable-infobars\",\n",
"    \"--disable-notifications\",\n",
"    \"--no-first-run\",\n",
"    \"--no-default-browser-check\",\n",
"):\n",
"    edge_options.add_argument(launch_switch)\n",
"\n",
"# Browser user-data directory, passed to Edge through --user-data-dir.\n",
"user_data_dir = r\"D:\\code\\edge\"\n",
"edge_options.add_argument(f\"--user-data-dir={user_data_dir}\")\n",
"# Select the profile to use (optional; the default profile is \"Default\").\n",
"edge_options.add_argument(\"--profile-directory=Default\")"
],
"id": "e5632e44a52d5dc4",
"outputs": [],
"execution_count": 13
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T08:51:47.136948Z",
"start_time": "2025-08-21T08:51:45.814223Z"
}
},
"cell_type": "code",
"source": [
"# Path to the EdgeDriver executable (optional; can be omitted when the\n",
"# environment is already configured to locate the driver).\n",
"edge_driver_path = r\"D:\\app\\edgeDriver\\msedgedriver.exe\"\n",
"service = Service(executable_path=edge_driver_path)\n",
"# Create the Edge browser instance using the options configured earlier.\n",
"driver = webdriver.Edge(\n",
"    service=service,\n",
"    options=edge_options,\n",
")"
],
"id": "28b1479c3decc6b1",
"outputs": [],
"execution_count": 14
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T08:51:53.084554Z",
"start_time": "2025-08-21T08:51:47.152588Z"
}
},
"cell_type": "code",
"source": [
"# Load the target page in the browser.\n",
"target_url = \"https://www.tianyiwangxiao.com/new/question-bank/learn-center-analyze/4d60c96ef05c452b812654e78af7701a/1957604601548296194?from=ht2\"\n",
"driver.get(target_url)\n",
"\n",
"# Wait for the page to render: block (up to 720 seconds) until a <body>\n",
"# element is present, then pause a further fixed 3 seconds.\n",
"wait = WebDriverWait(driver, 720)\n",
"body_locator = (By.TAG_NAME, \"body\")\n",
"wait.until(EC.presence_of_element_located(body_locator))\n",
"time.sleep(3)\n",
"\n"
],
"id": "779f88e1c3670c02",
"outputs": [],
"execution_count": 15
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T08:51:59.934014Z",
"start_time": "2025-08-21T08:51:59.829632Z"
}
},
"cell_type": "code",
"source": [
"# Enter question-memorization mode by clicking the \".el-icon-right.next\" control.\n",
"# element_to_be_clickable waits until the element is both visible and enabled,\n",
"# so click() is not issued against a control that is shown but not yet usable.\n",
"clickable_element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, \".el-icon-right.next\")))\n",
"clickable_element.click()\n",
"\n"
],
"id": "721f5a8a872bfdce",
"outputs": [],
"execution_count": 17
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T08:52:07.855382Z",
"start_time": "2025-08-21T08:52:07.834085Z"
}
},
"cell_type": "code",
"source": [
"# Capture the HTML of the page as currently rendered by the browser.\n",
"rendered_html = driver.page_source"
],
"id": "aa728e660ee9bbe5",
"outputs": [],
"execution_count": 18
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T09:03:09.025560Z",
"start_time": "2025-08-21T09:03:08.973835Z"
}
},
"cell_type": "code",
"source": [
"# Parse the captured HTML with the built-in \"html.parser\" backend.\n",
"soup = BeautifulSoup(rendered_html, \"html.parser\")\n",
"\n",
"# Collect every matching node of each kind; find_all returns a list-like result.\n",
"title = soup.find_all(\"p\", class_=\"title\")  # question text node(s)\n",
"out_options_box = soup.find_all(\"div\", class_=\"options-box\")  # answer-option container(s)\n",
"analyze = soup.find_all(\"div\", class_=\"analyze\")  # answer-explanation node(s)"
],
"id": "5db0bbd564c0b53f",
"outputs": [],
"execution_count": 33
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T09:03:10.108380Z",
"start_time": "2025-08-21T09:03:10.102301Z"
}
},
"cell_type": "code",
"source": "print(title)",
"id": "9ae9f13772cfed6a",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[<p class=\"title\" data-v-127dd018=\"\">根据《期货经营机构投资者适当性管理实施指引(试行)》,经营机构评估,划分所销售产品或者所提供服务的风险等级时,涉及投资组合的产品或服务的,下列表述中正确的是()。</p>]\n"
]
}
],
"execution_count": 34
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T09:11:05.144092Z",
"start_time": "2025-08-21T09:11:05.136517Z"
}
},
"cell_type": "code",
"source": [
"# Look up the answer-option container(s) in the parsed page.\n",
"out_options_box = soup.find_all(\"div\", class_=\"options-box\")\n",
"\n",
"# Exactly one container is expected; any other count means the page does not\n",
"# have the layout the parsing below relies on.\n",
"if len(out_options_box) != 1:\n",
"    # Raise a real exception: raising the result list itself only produces\n",
"    # \"TypeError: exceptions must derive from BaseException\" and hides the cause.\n",
"    raise ValueError(f\"expected exactly 1 'options-box' element, found {len(out_options_box)}\")\n",
"out_options = out_options_box[0].find_all(\"div\", class_=\"options-item\")\n",
"\n",
"for out_option in out_options:\n",
"    label_tag = out_option.find(\"p\", class_=\"label\")\n",
"    text_tag = out_option.find(\"p\", class_=\"text\")\n",
"    # Fail with a clear message instead of an AttributeError on None.\n",
"    if label_tag is None or text_tag is None:\n",
"        raise ValueError(f\"option item is missing its label or text element: {out_option}\")\n",
"\n",
"    # In the captured page the correct option's label carries the extra\n",
"    # \"success-active\" CSS class.\n",
"    trueFalse = \"success-active\" in label_tag.get(\"class\", [])\n",
"    abcd = label_tag.text.strip()  # option letter, e.g. \"A\"\n",
"    answer = text_tag.text.strip()  # option text\n",
"\n",
"    print(abcd, answer, trueFalse)"
],
"id": "11d9051ab089122d",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A 可以按照产品或服务对应的任何一个风险等级进行评估 False\n",
"B 应当按照产品或服务最低风险等级进行评估 False\n",
"C 应当按照产品或服务最高风险等级进行评估 False\n",
"D 应当按照产品或服务整体风险等级进行评估 True\n"
]
}
],
"execution_count": 39
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-21T08:55:04.344999Z",
"start_time": "2025-08-21T08:55:04.339191Z"
}
},
"cell_type": "code",
"source": "print(analyze)",
"id": "b7f43a482ce3c619",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[<div class=\"analyze\" data-v-24612254=\"\">涉及投资组合的产品或服务,应当按照产品或服务整体风险等级进行评估。 <br/><br/> </div>]\n"
]
}
],
"execution_count": 22
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "ad769b774bac8989"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}