1
This commit is contained in:
388
crawler/getLeCheng_chapter.ipynb
Normal file
388
crawler/getLeCheng_chapter.ipynb
Normal file
@@ -0,0 +1,388 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"id": "initial_id",
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:03:22.758571Z",
|
||||
"start_time": "2025-08-19T01:03:22.753008Z"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"import time\n",
|
||||
"\n",
|
||||
"from selenium import webdriver\n",
|
||||
"from selenium.webdriver.edge.service import Service\n",
|
||||
"from selenium.webdriver.common.by import By\n",
|
||||
"from selenium.webdriver.support.ui import WebDriverWait\n",
|
||||
"from selenium.webdriver.support import expected_conditions as EC\n",
|
||||
"from selenium.webdriver.edge.options import Options"
|
||||
],
|
||||
"outputs": [],
|
||||
"execution_count": 49
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:03:23.224371Z",
|
||||
"start_time": "2025-08-19T01:03:23.220216Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"from bs4 import BeautifulSoup\n",
|
||||
"import sqlite3"
|
||||
],
|
||||
"id": "59b26d9f105eae85",
|
||||
"outputs": [],
|
||||
"execution_count": 50
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:03:25.179818Z",
|
||||
"start_time": "2025-08-19T01:03:25.173558Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "db_path = '../data.db'",
|
||||
"id": "37a70656848ceced",
|
||||
"outputs": [],
|
||||
"execution_count": 51
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:03:25.713012Z",
|
||||
"start_time": "2025-08-19T01:03:25.704775Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"conn = sqlite3.connect(db_path)\n",
|
||||
"conn.execute('''CREATE TABLE \"questions\"\n",
|
||||
"(\n",
|
||||
" id INTEGER\n",
|
||||
" constraint questions_pk\n",
|
||||
" primary key autoincrement,\n",
|
||||
" title TEXT,\n",
|
||||
" chapter TEXT,\n",
|
||||
" q_num text,\n",
|
||||
" q_type text,\n",
|
||||
" question TEXT not null,\n",
|
||||
" a TEXT not null,\n",
|
||||
" b TEXT not null,\n",
|
||||
" c TEXT not null,\n",
|
||||
" d TEXT not null,\n",
|
||||
" a_result BLOB default false,\n",
|
||||
" b_result BLOB default false,\n",
|
||||
" c_result BLOB default false,\n",
|
||||
" d_result BLOB default false,\n",
|
||||
" explanation TEXT,\n",
|
||||
" count integer default 3 not null\n",
|
||||
")''')\n",
|
||||
"\n",
|
||||
"conn.execute('''CREATE TABLE \"answers_history\"\n",
|
||||
"(\n",
|
||||
" id INTEGER not null\n",
|
||||
" constraint answers_history__questions_id_fk\n",
|
||||
" references questions,\n",
|
||||
" time_used INTEGER,\n",
|
||||
" state INTEGER,\n",
|
||||
" time text default CURRENT_TIMESTAMP\n",
|
||||
")''')\n",
|
||||
"\n",
|
||||
"conn.execute('''CREATE TABLE url\n",
|
||||
" (\n",
|
||||
" id INTEGER not null,\n",
|
||||
" url TEXT\n",
|
||||
" )''')\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"conn.commit()\n"
|
||||
],
|
||||
"id": "d70a270099e8b056",
|
||||
"outputs": [],
|
||||
"execution_count": 52
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:03:27.430817Z",
|
||||
"start_time": "2025-08-19T01:03:27.423603Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"edge_options = Options()\n",
|
||||
"#edge_options.add_argument(\"--headless\") # 可选:无界面模式\n",
|
||||
"edge_options.add_argument(\"--disable-gpu\")\n",
|
||||
"edge_options.add_argument(\"--no-sandbox\")\n",
|
||||
"edge_options.add_argument(\"--disable-extensions\")\n",
|
||||
"edge_options.add_argument(\"--disable-plugins\")\n",
|
||||
"edge_options.add_argument(\"--disable-popup-blocking\")\n",
|
||||
"edge_options.add_argument(\"--disable-infobars\")\n",
|
||||
"edge_options.add_argument(\"--disable-notifications\")\n",
|
||||
"edge_options.add_argument(\"--no-first-run\")\n",
|
||||
"edge_options.add_argument(\"--no-default-browser-check\")\n",
|
||||
"\n",
|
||||
"user_data_dir = r\"D:\\code\\edge\"\n",
|
||||
"edge_options.add_argument(f\"--user-data-dir={user_data_dir}\")\n",
|
||||
"# 指定配置文件(可选,默认是 Default)\n",
|
||||
"edge_options.add_argument(\"--profile-directory=Default\")"
|
||||
],
|
||||
"id": "e4a35062c4549f44",
|
||||
"outputs": [],
|
||||
"execution_count": 53
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:03:30.978615Z",
|
||||
"start_time": "2025-08-19T01:03:28.414779Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# 指定 EdgeDriver 路径(可选,若已配置环境变量可省略)\n",
|
||||
"service = Service(executable_path=r\"D:\\app\\edgeDriver\\msedgedriver.exe\")\n",
|
||||
"# 创建 Edge 浏览器实例\n",
|
||||
"driver = webdriver.Edge(service=service, options=edge_options)"
|
||||
],
|
||||
"id": "9b48ddaca80598aa",
|
||||
"outputs": [],
|
||||
"execution_count": 54
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:00:42.177993Z",
|
||||
"start_time": "2025-08-19T01:00:42.171173Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def get_web(url):\n",
|
||||
" driver.get(url)\n",
|
||||
"\n",
|
||||
" # 等待页面渲染完成(例如等待 body 加载)\n",
|
||||
" wait = WebDriverWait(driver, 720)\n",
|
||||
" wait.until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n",
|
||||
" time.sleep(3)\n",
|
||||
"\n",
|
||||
" #进入背题模式\n",
|
||||
" clickable_element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, \".setting-type.iconfont.icon-setting\")))\n",
|
||||
" clickable_element.click()\n",
|
||||
" wait.until(\n",
|
||||
" EC.element_to_be_clickable((By.CSS_SELECTOR, \".question-setting-button.ant-btn.ant-btn-default\"))).click()\n",
|
||||
"\n",
|
||||
" # 获取渲染后的 HTML\n",
|
||||
" rendered_html = driver.page_source\n",
|
||||
" return rendered_html"
|
||||
],
|
||||
"id": "2b02063fec8abbdd",
|
||||
"outputs": [],
|
||||
"execution_count": 43
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:00:42.212116Z",
|
||||
"start_time": "2025-08-19T01:00:42.206610Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def list_get(lst, index, default=\"\"):\n",
|
||||
" try:\n",
|
||||
" return lst[index]\n",
|
||||
" except IndexError:\n",
|
||||
" return default"
|
||||
],
|
||||
"id": "de9650bb0e005d4a",
|
||||
"outputs": [],
|
||||
"execution_count": 44
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:00:42.247110Z",
|
||||
"start_time": "2025-08-19T01:00:42.237114Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def write2db(rendered_html, info):\n",
|
||||
" # 解析web并登记\n",
|
||||
" soup = BeautifulSoup(rendered_html, 'html.parser')\n",
|
||||
" full_left = soup.find_all('div', class_='pull-left')\n",
|
||||
" for questions in full_left:\n",
|
||||
" for question in questions.children:\n",
|
||||
" title_info = [] #num,type,question\n",
|
||||
" answers_info = []\n",
|
||||
" answers_correct_info = []\n",
|
||||
" explain_info = \"\"\n",
|
||||
"\n",
|
||||
" # 标题信息\n",
|
||||
" for title in question.find_all('div', class_='p-stem'):\n",
|
||||
" for element in title.children:\n",
|
||||
" title_info.append(element.text)\n",
|
||||
"\n",
|
||||
" # 题目信息\n",
|
||||
" for answer in question.find_all('div', class_='answer-ul'):\n",
|
||||
" for element in answer.find_all(\"div\", recursive=False):\n",
|
||||
" # 答案\n",
|
||||
" if \"answer\" in element.get(\"class\"):\n",
|
||||
" answers_correct_info.append(True)\n",
|
||||
" else:\n",
|
||||
" answers_correct_info.append(False)\n",
|
||||
"\n",
|
||||
" # 问题\n",
|
||||
" text_elements = element.select(\"div > div > div > div > p\")\n",
|
||||
" for text_element in text_elements:\n",
|
||||
" if text_element.text is not None and text_element.text != \"\":\n",
|
||||
" answers_info.append(text_element.text)\n",
|
||||
"\n",
|
||||
" # 解析\n",
|
||||
" for explain in question.find_all('div', class_='practise-answer-text'):\n",
|
||||
" explain_info += str(explain.get_text(strip=True))\n",
|
||||
"\n",
|
||||
" cursor = conn.execute(\n",
|
||||
" \"INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n",
|
||||
" (\n",
|
||||
" info[1],\n",
|
||||
" info[2],\n",
|
||||
" list_get(title_info,0),\n",
|
||||
" list_get(title_info,1),\n",
|
||||
" list_get(title_info,2),\n",
|
||||
" list_get(answers_info,0),\n",
|
||||
" list_get(answers_info,1),\n",
|
||||
" list_get(answers_info,2),\n",
|
||||
" list_get(answers_info,3),\n",
|
||||
" list_get(answers_correct_info,0,False),\n",
|
||||
" list_get(answers_correct_info,1,False),\n",
|
||||
" list_get(answers_correct_info,2,False),\n",
|
||||
" list_get(answers_correct_info,3,False),\n",
|
||||
" explain_info,\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" inserted_id = cursor.lastrowid\n",
|
||||
" conn.execute(\n",
|
||||
" \"INSERT INTO url (id, url) VALUES (?, ?)\",\n",
|
||||
" (inserted_id, info[0], )\n",
|
||||
" )\n",
|
||||
" conn.commit()"
|
||||
],
|
||||
"id": "c28a23cbd84f6ea0",
|
||||
"outputs": [],
|
||||
"execution_count": 45
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:00:42.273569Z",
|
||||
"start_time": "2025-08-19T01:00:42.265569Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"bg_infos = [\n",
|
||||
" [\"期货乐橙章节练习\", 1, 1414, 1],\n",
|
||||
" [\"期货乐橙章节练习\", 2, 1419, 3],\n",
|
||||
" [\"期货乐橙章节练习\", 3, 1448, 2],\n",
|
||||
" [\"期货乐橙章节练习\", 4, 1485, 2],\n",
|
||||
" [\"期货乐橙章节练习\", 5, 1523, 2],\n",
|
||||
" [\"期货乐橙章节练习\", 6, 1543, 2],\n",
|
||||
"]"
|
||||
],
|
||||
"id": "f8ed3be15b2a69a7",
|
||||
"outputs": [],
|
||||
"execution_count": 46
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:00:42.311569Z",
|
||||
"start_time": "2025-08-19T01:00:42.302568Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def main():\n",
|
||||
" try:\n",
|
||||
" for bg_info in bg_infos:\n",
|
||||
" for index in range(1, bg_info[3]+1):\n",
|
||||
" url = f\"https://www.bestlec.com/practise/practise?title=%E9%A1%BA%E5%BA%8F%E7%BB%83%E4%B9%A0&qBankId=39&qBankTitle=%E3%80%90%E6%9C%9F%E8%B4%A7%E6%B3%95%E8%A7%84%E3%80%91%E7%AB%A0%E8%8A%82%E7%BB%83%E4%B9%A0&chapterId={bg_info[2]}&practise=1&type=practise&selectSec={index}\"\n",
|
||||
" rendered_html = get_web(url)\n",
|
||||
" write2db(rendered_html, [url, bg_info[0], bg_info[1]])\n",
|
||||
" except Exception as e:\n",
|
||||
" print(\"error: \" + e)\n",
|
||||
" finally:\n",
|
||||
" try:\n",
|
||||
" conn.close()\n",
|
||||
" except Exception as e:\n",
|
||||
" print(\"db:\", e)\n",
|
||||
"\n",
|
||||
" try:\n",
|
||||
" driver.quit()\n",
|
||||
" except Exception as e:\n",
|
||||
" print(\"brother:\", e)"
|
||||
],
|
||||
"id": "fcfc560b46c29aaa",
|
||||
"outputs": [],
|
||||
"execution_count": 47
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:01:51.670165Z",
|
||||
"start_time": "2025-08-19T01:00:42.337618Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"if __name__ == '__main__':\n",
|
||||
" main()"
|
||||
],
|
||||
"id": "811c9d3647c46f8b",
|
||||
"outputs": [],
|
||||
"execution_count": 48
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-08-19T01:01:51.740128Z",
|
||||
"start_time": "2025-08-19T01:01:51.737199Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": "",
|
||||
"id": "5224515d66fe0b",
|
||||
"outputs": [],
|
||||
"execution_count": null
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Reference in New Issue
Block a user