Files
cyzg/crawler/getLeCheng_chapter.ipynb
2025-08-20 16:25:51 +08:00

389 lines
12 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-08-19T01:03:22.758571Z",
"start_time": "2025-08-19T01:03:22.753008Z"
}
},
"source": [
"import time\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.edge.service import Service\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.webdriver.edge.options import Options"
],
"outputs": [],
"execution_count": 49
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:23.224371Z",
"start_time": "2025-08-19T01:03:23.220216Z"
}
},
"cell_type": "code",
"source": [
"from bs4 import BeautifulSoup\n",
"import sqlite3"
],
"id": "59b26d9f105eae85",
"outputs": [],
"execution_count": 50
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:25.179818Z",
"start_time": "2025-08-19T01:03:25.173558Z"
}
},
"cell_type": "code",
"source": "db_path = '../data.db'",
"id": "37a70656848ceced",
"outputs": [],
"execution_count": 51
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:25.713012Z",
"start_time": "2025-08-19T01:03:25.704775Z"
}
},
"cell_type": "code",
"source": [
"conn = sqlite3.connect(db_path)\n",
"conn.execute('''CREATE TABLE \"questions\"\n",
"(\n",
" id INTEGER\n",
" constraint questions_pk\n",
" primary key autoincrement,\n",
" title TEXT,\n",
" chapter TEXT,\n",
" q_num text,\n",
" q_type text,\n",
" question TEXT not null,\n",
" a TEXT not null,\n",
" b TEXT not null,\n",
" c TEXT not null,\n",
" d TEXT not null,\n",
" a_result BLOB default false,\n",
" b_result BLOB default false,\n",
" c_result BLOB default false,\n",
" d_result BLOB default false,\n",
" explanation TEXT,\n",
" count integer default 3 not null\n",
")''')\n",
"\n",
"conn.execute('''CREATE TABLE \"answers_history\"\n",
"(\n",
" id INTEGER not null\n",
" constraint answers_history__questions_id_fk\n",
" references questions,\n",
" time_used INTEGER,\n",
" state INTEGER,\n",
" time text default CURRENT_TIMESTAMP\n",
")''')\n",
"\n",
"conn.execute('''CREATE TABLE url\n",
" (\n",
" id INTEGER not null,\n",
" url TEXT\n",
" )''')\n",
"\n",
"\n",
"conn.commit()\n"
],
"id": "d70a270099e8b056",
"outputs": [],
"execution_count": 52
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:27.430817Z",
"start_time": "2025-08-19T01:03:27.423603Z"
}
},
"cell_type": "code",
"source": [
"edge_options = Options()\n",
"#edge_options.add_argument(\"--headless\") # 可选:无界面模式\n",
"edge_options.add_argument(\"--disable-gpu\")\n",
"edge_options.add_argument(\"--no-sandbox\")\n",
"edge_options.add_argument(\"--disable-extensions\")\n",
"edge_options.add_argument(\"--disable-plugins\")\n",
"edge_options.add_argument(\"--disable-popup-blocking\")\n",
"edge_options.add_argument(\"--disable-infobars\")\n",
"edge_options.add_argument(\"--disable-notifications\")\n",
"edge_options.add_argument(\"--no-first-run\")\n",
"edge_options.add_argument(\"--no-default-browser-check\")\n",
"\n",
"user_data_dir = r\"D:\\code\\edge\"\n",
"edge_options.add_argument(f\"--user-data-dir={user_data_dir}\")\n",
"# 指定配置文件(可选,默认是 Default\n",
"edge_options.add_argument(\"--profile-directory=Default\")"
],
"id": "e4a35062c4549f44",
"outputs": [],
"execution_count": 53
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:30.978615Z",
"start_time": "2025-08-19T01:03:28.414779Z"
}
},
"cell_type": "code",
"source": [
"# 指定 EdgeDriver 路径(可选,若已配置环境变量可省略)\n",
"service = Service(executable_path=r\"D:\\app\\edgeDriver\\msedgedriver.exe\")\n",
"# 创建 Edge 浏览器实例\n",
"driver = webdriver.Edge(service=service, options=edge_options)"
],
"id": "9b48ddaca80598aa",
"outputs": [],
"execution_count": 54
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.177993Z",
"start_time": "2025-08-19T01:00:42.171173Z"
}
},
"cell_type": "code",
"source": [
"def get_web(url):\n",
" driver.get(url)\n",
"\n",
" # 等待页面渲染完成(例如等待 body 加载)\n",
" wait = WebDriverWait(driver, 720)\n",
" wait.until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n",
" time.sleep(3)\n",
"\n",
" #进入背题模式\n",
" clickable_element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, \".setting-type.iconfont.icon-setting\")))\n",
" clickable_element.click()\n",
" wait.until(\n",
" EC.element_to_be_clickable((By.CSS_SELECTOR, \".question-setting-button.ant-btn.ant-btn-default\"))).click()\n",
"\n",
" # 获取渲染后的 HTML\n",
" rendered_html = driver.page_source\n",
" return rendered_html"
],
"id": "2b02063fec8abbdd",
"outputs": [],
"execution_count": 43
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.212116Z",
"start_time": "2025-08-19T01:00:42.206610Z"
}
},
"cell_type": "code",
"source": [
"def list_get(lst, index, default=\"\"):\n",
" try:\n",
" return lst[index]\n",
" except IndexError:\n",
" return default"
],
"id": "de9650bb0e005d4a",
"outputs": [],
"execution_count": 44
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.247110Z",
"start_time": "2025-08-19T01:00:42.237114Z"
}
},
"cell_type": "code",
"source": [
"def write2db(rendered_html, info):\n",
" # 解析web并登记\n",
" soup = BeautifulSoup(rendered_html, 'html.parser')\n",
" full_left = soup.find_all('div', class_='pull-left')\n",
" for questions in full_left:\n",
" for question in questions.children:\n",
" title_info = [] #num,type,question\n",
" answers_info = []\n",
" answers_correct_info = []\n",
" explain_info = \"\"\n",
"\n",
" # 标题信息\n",
" for title in question.find_all('div', class_='p-stem'):\n",
" for element in title.children:\n",
" title_info.append(element.text)\n",
"\n",
" # 题目信息\n",
" for answer in question.find_all('div', class_='answer-ul'):\n",
" for element in answer.find_all(\"div\", recursive=False):\n",
" # 答案\n",
" if \"answer\" in element.get(\"class\"):\n",
" answers_correct_info.append(True)\n",
" else:\n",
" answers_correct_info.append(False)\n",
"\n",
" # 问题\n",
" text_elements = element.select(\"div > div > div > div > p\")\n",
" for text_element in text_elements:\n",
" if text_element.text is not None and text_element.text != \"\":\n",
" answers_info.append(text_element.text)\n",
"\n",
" # 解析\n",
" for explain in question.find_all('div', class_='practise-answer-text'):\n",
" explain_info += str(explain.get_text(strip=True))\n",
"\n",
" cursor = conn.execute(\n",
" \"INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n",
" (\n",
" info[1],\n",
" info[2],\n",
" list_get(title_info,0),\n",
" list_get(title_info,1),\n",
" list_get(title_info,2),\n",
" list_get(answers_info,0),\n",
" list_get(answers_info,1),\n",
" list_get(answers_info,2),\n",
" list_get(answers_info,3),\n",
" list_get(answers_correct_info,0,False),\n",
" list_get(answers_correct_info,1,False),\n",
" list_get(answers_correct_info,2,False),\n",
" list_get(answers_correct_info,3,False),\n",
" explain_info,\n",
" )\n",
" )\n",
" inserted_id = cursor.lastrowid\n",
" conn.execute(\n",
" \"INSERT INTO url (id, url) VALUES (?, ?)\",\n",
" (inserted_id, info[0], )\n",
" )\n",
" conn.commit()"
],
"id": "c28a23cbd84f6ea0",
"outputs": [],
"execution_count": 45
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.273569Z",
"start_time": "2025-08-19T01:00:42.265569Z"
}
},
"cell_type": "code",
"source": [
"bg_infos = [\n",
" [\"期货乐橙章节练习\", 1, 1414, 1],\n",
" [\"期货乐橙章节练习\", 2, 1419, 3],\n",
" [\"期货乐橙章节练习\", 3, 1448, 2],\n",
" [\"期货乐橙章节练习\", 4, 1485, 2],\n",
" [\"期货乐橙章节练习\", 5, 1523, 2],\n",
" [\"期货乐橙章节练习\", 6, 1543, 2],\n",
"]"
],
"id": "f8ed3be15b2a69a7",
"outputs": [],
"execution_count": 46
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.311569Z",
"start_time": "2025-08-19T01:00:42.302568Z"
}
},
"cell_type": "code",
"source": [
"def main():\n",
" try:\n",
" for bg_info in bg_infos:\n",
" for index in range(1, bg_info[3]+1):\n",
" url = f\"https://www.bestlec.com/practise/practise?title=%E9%A1%BA%E5%BA%8F%E7%BB%83%E4%B9%A0&qBankId=39&qBankTitle=%E3%80%90%E6%9C%9F%E8%B4%A7%E6%B3%95%E8%A7%84%E3%80%91%E7%AB%A0%E8%8A%82%E7%BB%83%E4%B9%A0&chapterId={bg_info[2]}&practise=1&type=practise&selectSec={index}\"\n",
" rendered_html = get_web(url)\n",
" write2db(rendered_html, [url, bg_info[0], bg_info[1]])\n",
" except Exception as e:\n",
" print(\"error: \" + e)\n",
" finally:\n",
" try:\n",
" conn.close()\n",
" except Exception as e:\n",
" print(\"db:\", e)\n",
"\n",
" try:\n",
" driver.quit()\n",
" except Exception as e:\n",
" print(\"brother:\", e)"
],
"id": "fcfc560b46c29aaa",
"outputs": [],
"execution_count": 47
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:01:51.670165Z",
"start_time": "2025-08-19T01:00:42.337618Z"
}
},
"cell_type": "code",
"source": [
"if __name__ == '__main__':\n",
" main()"
],
"id": "811c9d3647c46f8b",
"outputs": [],
"execution_count": 48
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:01:51.740128Z",
"start_time": "2025-08-19T01:01:51.737199Z"
}
},
"cell_type": "code",
"source": "",
"id": "5224515d66fe0b",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}