Files
cyzg/crawler/get_tianyi_1.ipynb
2025-08-22 10:04:00 +08:00

385 lines
11 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-08-22T00:51:15.383830Z",
"start_time": "2025-08-22T00:51:15.162928Z"
}
},
"source": [
"import time\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.edge.service import Service\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.webdriver.edge.options import Options"
],
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T00:51:16.137665Z",
"start_time": "2025-08-22T00:51:16.016527Z"
}
},
"cell_type": "code",
"source": [
"from bs4 import BeautifulSoup\n",
"import sqlite3"
],
"id": "f184b255d5098302",
"outputs": [],
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T01:24:38.259284Z",
"start_time": "2025-08-22T01:24:38.253051Z"
}
},
"cell_type": "code",
"source": [
"# SQLite database file, given relative to the notebook's working directory.\n",
"db_path = \"../data.db\"\n",
"# Module-level connection; write2db() runs and commits its inserts through it.\n",
"conn = sqlite3.connect(db_path)"
],
"id": "4813fcf4dea28b8d",
"outputs": [],
"execution_count": 54
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T01:24:21.532983Z",
"start_time": "2025-08-22T01:24:21.528098Z"
}
},
"cell_type": "code",
"source": "# conn.close()",
"id": "8ea63e4cb82fe0c",
"outputs": [],
"execution_count": 53
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T00:51:16.185320Z",
"start_time": "2025-08-22T00:51:16.176280Z"
}
},
"cell_type": "code",
"source": [
"edge_options = Options()\n",
"# Optional: enable the next line to run Edge without a visible window.\n",
"# edge_options.add_argument(\"--headless\")\n",
"for _flag in (\n",
"    \"--disable-gpu\",\n",
"    \"--no-sandbox\",\n",
"    \"--disable-extensions\",\n",
"    \"--disable-plugins\",\n",
"    \"--disable-popup-blocking\",\n",
"    \"--disable-infobars\",\n",
"    \"--disable-notifications\",\n",
"    \"--no-first-run\",\n",
"    \"--no-default-browser-check\",\n",
"):\n",
"    edge_options.add_argument(_flag)\n",
"\n",
"# Edge user-data directory handed to the browser via --user-data-dir.\n",
"user_data_dir = r\"D:\\code\\edge\"\n",
"edge_options.add_argument(\"--user-data-dir=\" + user_data_dir)\n",
"# Profile to use inside that directory (optional; \"Default\" is the default).\n",
"edge_options.add_argument(\"--profile-directory=Default\")"
],
"id": "e5632e44a52d5dc4",
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T00:51:18.395Z",
"start_time": "2025-08-22T00:51:16.198246Z"
}
},
"cell_type": "code",
"source": [
"# Location of the EdgeDriver binary (optional when msedgedriver is already on PATH).\n",
"service = Service(executable_path=r\"D:\\app\\edgeDriver\\msedgedriver.exe\")\n",
"# Start the Edge browser instance with the options configured in edge_options.\n",
"driver = webdriver.Edge(options=edge_options, service=service)"
],
"id": "28b1479c3decc6b1",
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T00:51:27.692312Z",
"start_time": "2025-08-22T00:51:18.413131Z"
}
},
"cell_type": "code",
"source": [
"# Question-analysis page the crawl starts from.\n",
"driver.get(\"https://www.tianyiwangxiao.com/new/question-bank/learn-center-analyze/4d60c96ef05c452b812654e78af7701a/1957604601548296194?from=ht2\")\n",
"\n",
"# Alternative start URL, kept for reference only (it is not loaded):\n",
"# https://www.tianyiwangxiao.com/new/question-bank/learn-center-analyze/94cfba022e2f4c7ebbeaa400576b3a9a/1958703246433423361?from=ht2\n",
"\n",
"# Shared explicit wait with a 720 s timeout; next_page() reuses it.\n",
"wait = WebDriverWait(driver, timeout=720)\n",
"# Block until the <body> element is present in the DOM, then pause 3 s\n",
"# (presumably to let the page finish rendering -- confirm).\n",
"wait.until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n",
"time.sleep(3)\n"
],
"id": "779f88e1c3670c02",
"outputs": [],
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T00:51:27.716520Z",
"start_time": "2025-08-22T00:51:27.708786Z"
}
},
"cell_type": "code",
"source": [
"def next_page():\n",
"    \"\"\"Advance the browser to the following question.\n",
"\n",
"    Waits, using the module-level ``wait``, until the element matching\n",
"    ``.el-icon-right.next`` is clickable (visible and enabled) and then\n",
"    clicks it.\n",
"\n",
"    Raises:\n",
"        selenium.common.exceptions.TimeoutException: if the element does not\n",
"            become clickable before ``wait`` times out.\n",
"    \"\"\"\n",
"    # element_to_be_clickable also requires the element to be enabled, which\n",
"    # a visibility check alone does not guarantee.\n",
"    next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, \".el-icon-right.next\")))\n",
"    next_button.click()\n",
"\n",
"def get_html():\n",
"    \"\"\"Return the page source currently reported by the module-level ``driver``.\"\"\"\n",
"    return driver.page_source"
],
"id": "721f5a8a872bfdce",
"outputs": [],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T01:24:45.201186Z",
"start_time": "2025-08-22T01:24:45.184772Z"
}
},
"cell_type": "code",
"source": [
"def html_parser(rendered_html):\n",
"    \"\"\"Extract one question from a question page's HTML.\n",
"\n",
"    Args:\n",
"        rendered_html: HTML text of the page, e.g. the value returned by get_html().\n",
"\n",
"    Returns:\n",
"        A dict with the keys ``title`` and ``analyze`` (stripped text of the\n",
"        first ``p.title`` and ``div.analyze`` elements) plus either:\n",
"\n",
"        * one key per option label, lower-cased, mapping to\n",
"          ``[option_text, is_correct]`` when the options box holds exactly\n",
"          four ``div.options-item`` elements; or\n",
"        * ``tf``: True / False for a true-false question, or the sentinel 0\n",
"          when no verdict could be read from the page.\n",
"\n",
"    Raises:\n",
"        ValueError: if the title, the analysis, or exactly one\n",
"            ``div.options-box`` element cannot be found.\n",
"    \"\"\"\n",
"    soup = BeautifulSoup(rendered_html, 'html.parser')\n",
"\n",
"    titles = soup.find_all('p', class_='title')\n",
"    option_boxes = soup.find_all('div', class_='options-box')\n",
"    analyses = soup.find_all('div', class_='analyze')\n",
"\n",
"    # Fail with a readable message instead of a bare IndexError.\n",
"    if not titles or not analyses:\n",
"        raise ValueError(\"question title or analysis not found in page HTML\")\n",
"    # Raising the ResultSet itself is not possible (only exceptions can be\n",
"    # raised), so report the unexpected count through a real exception.\n",
"    if len(option_boxes) != 1:\n",
"        raise ValueError(f\"expected exactly one options-box element, found {len(option_boxes)}\")\n",
"\n",
"    result = {\"title\": titles[0].text.strip(), \"analyze\": analyses[0].text.strip()}\n",
"\n",
"    option_items = option_boxes[0].find_all(\"div\", class_=\"options-item\")\n",
"\n",
"    if len(option_items) == 4:\n",
"        # Choice question with four options.\n",
"        for item in option_items:\n",
"            label = item.find(\"p\", \"label\")\n",
"            # The correct option's label carries the \"success-active\" class.\n",
"            is_correct = \"success-active\" in (label.get(\"class\") or [])\n",
"            key = label.text.strip().lower()\n",
"            option_text = item.find(\"p\", \"text\").text.strip()\n",
"            result[key] = [option_text, is_correct]\n",
"    else:\n",
"        # True/false question: read the verdict from the answer box.\n",
"        def read_verdict():\n",
"            answer_boxes = soup.find_all('div', class_='answer-box')\n",
"            for box in answer_boxes:\n",
"                for mark in box.find_all('div', class_='CORRECT'):\n",
"                    verdict = mark.text.strip()\n",
"                    if verdict == \"正确\":\n",
"                        return True\n",
"                    if verdict == \"错误\":\n",
"                        return False\n",
"            # Nothing recognisable: dump the boxes for inspection and return\n",
"            # the sentinel 0, which write2db() treats as \"skip\".\n",
"            print(answer_boxes)\n",
"            return 0\n",
"\n",
"        result[\"tf\"] = read_verdict()\n",
"\n",
"    return result\n"
],
"id": "5db0bbd564c0b53f",
"outputs": [],
"execution_count": 55
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T01:41:12.394198Z",
"start_time": "2025-08-22T01:41:12.386634Z"
}
},
"cell_type": "code",
"source": [
"def write2db(index, result):\n",
"    \"\"\"Insert one parsed question into the ``questions`` table and commit.\n",
"\n",
"    Args:\n",
"        index: question number, stored in the ``q_num`` column.\n",
"        result: dict produced by html_parser(). Without a ``tf`` key it is\n",
"            stored as a choice question using its ``a``..``d`` entries;\n",
"            with a boolean ``tf`` it is stored as a true/false question.\n",
"            Any non-boolean ``tf`` (html_parser's 0 sentinel) is skipped.\n",
"\n",
"    Uses the notebook globals ``conn`` (open sqlite3 connection) and ``info``\n",
"    (``info[0]`` goes to the ``title`` column, ``info[1]`` to ``chapter``).\n",
"    \"\"\"\n",
"    insert_sql = \"INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\"\n",
"\n",
"    if \"tf\" not in result:\n",
"        q_type = \"多选题\"\n",
"        option_keys = (\"a\", \"b\", \"c\", \"d\")\n",
"        option_texts = [result.get(key)[0] for key in option_keys]\n",
"        option_flags = [result.get(key)[1] for key in option_keys]\n",
"    else:\n",
"        verdict = result[\"tf\"]\n",
"        # html_parser() returns 0 when it found no verdict. Comparing with\n",
"        # `== 0` would also match False (False == 0 in Python) and silently\n",
"        # drop every question whose answer is \"wrong\", so test the type.\n",
"        if not isinstance(verdict, bool):\n",
"            return\n",
"        q_type = \"判断题\"\n",
"        option_texts = [\"\", \"\", \"\", \"\"]\n",
"        # a_result flags \"true is correct\", b_result flags \"false is correct\".\n",
"        option_flags = [1 if verdict else 0, 0 if verdict else 1, \"\", \"\"]\n",
"\n",
"    conn.execute(\n",
"        insert_sql,\n",
"        (\n",
"            info[0],\n",
"            info[1],\n",
"            index,\n",
"            q_type,\n",
"            result.get(\"title\"),\n",
"            *option_texts,\n",
"            *option_flags,\n",
"            result.get(\"analyze\"),\n",
"        ),\n",
"    )\n",
"    conn.commit()"
],
"id": "853f278c1123cae1",
"outputs": [],
"execution_count": 69
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T01:46:12.973092Z",
"start_time": "2025-08-22T01:46:12.968961Z"
}
},
"cell_type": "code",
"source": "info = [\"天一\",0,130]",
"id": "71ef002122c67647",
"outputs": [],
"execution_count": 81
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T01:50:44.170181Z",
"start_time": "2025-08-22T01:50:27.773842Z"
}
},
"cell_type": "code",
"source": [
"# Visit info[2] consecutive questions: parse the current page, store it, then advance.\n",
"# NOTE(review): nothing waits for the new question to render between next_page()\n",
"# and the following get_html() -- confirm the page updates fast enough.\n",
"for q_index in range(info[2]):\n",
"    page_html = get_html()\n",
"    write2db(q_index, html_parser(page_html))\n",
"    next_page()\n"
],
"id": "11d9051ab089122d",
"outputs": [],
"execution_count": 89
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T01:10:27.088143Z",
"start_time": "2025-08-22T01:10:27.076521Z"
}
},
"cell_type": "code",
"source": "",
"id": "ad769b774bac8989",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<div class=\"answer CORRECT\" data-v-ee229d58=\"\">正确</div>\n"
]
}
],
"execution_count": 40
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-22T01:06:54.877198Z",
"start_time": "2025-08-22T01:06:54.867849Z"
}
},
"cell_type": "code",
"source": "",
"id": "e1474fd283674850",
"outputs": [],
"execution_count": 35
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "54ad268f864e1f6c"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}