This commit is contained in:
2025-08-20 16:25:51 +08:00
commit c451fdf0e5
11 changed files with 1632 additions and 0 deletions

View File

@@ -0,0 +1,388 @@
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-08-19T01:03:22.758571Z",
"start_time": "2025-08-19T01:03:22.753008Z"
}
},
"source": [
"import time\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.edge.service import Service\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.webdriver.edge.options import Options"
],
"outputs": [],
"execution_count": 49
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:23.224371Z",
"start_time": "2025-08-19T01:03:23.220216Z"
}
},
"cell_type": "code",
"source": [
"from bs4 import BeautifulSoup\n",
"import sqlite3"
],
"id": "59b26d9f105eae85",
"outputs": [],
"execution_count": 50
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:25.179818Z",
"start_time": "2025-08-19T01:03:25.173558Z"
}
},
"cell_type": "code",
"source": "# SQLite database file opened by sqlite3.connect(); a relative path is resolved against the kernel's working directory.\ndb_path = '../data.db'",
"id": "37a70656848ceced",
"outputs": [],
"execution_count": 51
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:25.713012Z",
"start_time": "2025-08-19T01:03:25.704775Z"
}
},
"cell_type": "code",
"source": [
"# Open (creating it if missing) the SQLite file at db_path and make sure the schema exists.\n",
"# IF NOT EXISTS keeps this cell re-runnable against an already initialised database;\n",
"# a plain CREATE TABLE raises sqlite3.OperationalError the second time it is executed.\n",
"conn = sqlite3.connect(db_path)\n",
"\n",
"# questions: one row per scraped question (number, type, stem, options a-d,\n",
"# one correctness flag per option, explanation).\n",
"conn.execute('''CREATE TABLE IF NOT EXISTS \"questions\"\n",
"(\n",
" id INTEGER\n",
" constraint questions_pk\n",
" primary key autoincrement,\n",
" title TEXT,\n",
" chapter TEXT,\n",
" q_num text,\n",
" q_type text,\n",
" question TEXT not null,\n",
" a TEXT not null,\n",
" b TEXT not null,\n",
" c TEXT not null,\n",
" d TEXT not null,\n",
" a_result BLOB default false,\n",
" b_result BLOB default false,\n",
" c_result BLOB default false,\n",
" d_result BLOB default false,\n",
" explanation TEXT,\n",
" count integer default 3 not null\n",
")''')\n",
"\n",
"# answers_history: rows whose id references questions.\n",
"conn.execute('''CREATE TABLE IF NOT EXISTS \"answers_history\"\n",
"(\n",
" id INTEGER not null\n",
" constraint answers_history__questions_id_fk\n",
" references questions,\n",
" time_used INTEGER,\n",
" state INTEGER,\n",
" time text default CURRENT_TIMESTAMP\n",
")''')\n",
"\n",
"# url: pairs a questions row id with the page URL it was scraped from.\n",
"conn.execute('''CREATE TABLE IF NOT EXISTS url\n",
" (\n",
" id INTEGER not null,\n",
" url TEXT\n",
" )''')\n",
"\n",
"\n",
"conn.commit()\n"
],
"id": "d70a270099e8b056",
"outputs": [],
"execution_count": 52
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:27.430817Z",
"start_time": "2025-08-19T01:03:27.423603Z"
}
},
"cell_type": "code",
"source": [
"# Browser start-up configuration for the Edge session used by the scraper.\n",
"edge_options = Options()\n",
"# edge_options.add_argument(\"--headless\")  # optional: run without a visible window\n",
"\n",
"# Command-line switches, applied in the order listed.\n",
"for _edge_flag in (\n",
"    \"--disable-gpu\",\n",
"    \"--no-sandbox\",\n",
"    \"--disable-extensions\",\n",
"    \"--disable-plugins\",\n",
"    \"--disable-popup-blocking\",\n",
"    \"--disable-infobars\",\n",
"    \"--disable-notifications\",\n",
"    \"--no-first-run\",\n",
"    \"--no-default-browser-check\",\n",
"):\n",
"    edge_options.add_argument(_edge_flag)\n",
"\n",
"# Dedicated user-data directory handed to the browser.\n",
"user_data_dir = r\"D:\\code\\edge\"\n",
"edge_options.add_argument(f\"--user-data-dir={user_data_dir}\")\n",
"# Select the profile inside that directory (optional; \"Default\" is the default).\n",
"edge_options.add_argument(\"--profile-directory=Default\")"
],
"id": "e4a35062c4549f44",
"outputs": [],
"execution_count": 53
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:03:30.978615Z",
"start_time": "2025-08-19T01:03:28.414779Z"
}
},
"cell_type": "code",
"source": [
"# 指定 EdgeDriver 路径(可选,若已配置环境变量可省略)\n",
"service = Service(executable_path=r\"D:\\app\\edgeDriver\\msedgedriver.exe\")\n",
"# 创建 Edge 浏览器实例\n",
"driver = webdriver.Edge(service=service, options=edge_options)"
],
"id": "9b48ddaca80598aa",
"outputs": [],
"execution_count": 54
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.177993Z",
"start_time": "2025-08-19T01:00:42.171173Z"
}
},
"cell_type": "code",
"source": [
"def get_web(url, timeout=720, settle_seconds=3):\n",
"    \"\"\"Open ``url`` in the shared Edge driver, switch to memorisation mode and return the HTML.\n",
"\n",
"    Args:\n",
"        url: Address of the practice page to load.\n",
"        timeout: Maximum number of seconds each explicit wait may block.\n",
"            NOTE(review): the large default presumably leaves room for a manual\n",
"            login in the visible browser window - confirm.\n",
"        settle_seconds: Fixed pause after <body> is present, before the\n",
"            settings icon is looked up.\n",
"\n",
"    Returns:\n",
"        The value of ``driver.page_source`` read right after the second click.\n",
"\n",
"    Raises:\n",
"        selenium.common.exceptions.TimeoutException: If an awaited element is\n",
"            not present/clickable within ``timeout`` seconds.\n",
"    \"\"\"\n",
"    driver.get(url)\n",
"\n",
"    # Wait for the page to render (here: until <body> exists), then let it settle.\n",
"    wait = WebDriverWait(driver, timeout)\n",
"    wait.until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n",
"    time.sleep(settle_seconds)\n",
"\n",
"    # Enter memorisation mode: click the settings icon, then the settings button.\n",
"    settings_icon = (By.CSS_SELECTOR, \".setting-type.iconfont.icon-setting\")\n",
"    wait.until(EC.element_to_be_clickable(settings_icon)).click()\n",
"    mode_button = (By.CSS_SELECTOR, \".question-setting-button.ant-btn.ant-btn-default\")\n",
"    wait.until(EC.element_to_be_clickable(mode_button)).click()\n",
"\n",
"    # Hand back the rendered HTML.\n",
"    return driver.page_source"
],
"id": "2b02063fec8abbdd",
"outputs": [],
"execution_count": 43
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.212116Z",
"start_time": "2025-08-19T01:00:42.206610Z"
}
},
"cell_type": "code",
"source": [
"def list_get(lst, index, default=\"\"):\n",
"    \"\"\"Return ``lst[index]``, or ``default`` when that lookup raises IndexError.\"\"\"\n",
"    try:\n",
"        value = lst[index]\n",
"    except IndexError:\n",
"        value = default\n",
"    return value"
],
"id": "de9650bb0e005d4a",
"outputs": [],
"execution_count": 44
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.247110Z",
"start_time": "2025-08-19T01:00:42.237114Z"
}
},
"cell_type": "code",
"source": [
"def write2db(rendered_html, info):\n",
"    \"\"\"Parse one rendered practice page and store its questions in the database.\n",
"\n",
"    Args:\n",
"        rendered_html: HTML text of the page.\n",
"        info: Sequence ``[url, title, chapter]``. ``title`` and ``chapter`` are\n",
"            written to every ``questions`` row; ``url`` is written to the\n",
"            ``url`` table next to the new row id.\n",
"\n",
"    Side effects:\n",
"        Inserts through the notebook-level ``conn`` and commits once after the\n",
"        whole page has been processed.\n",
"    \"\"\"\n",
"    from bs4 import Tag  # local import: only needed to tell tags from text/comment nodes\n",
"\n",
"    # Parse the page and record every question found in it.\n",
"    soup = BeautifulSoup(rendered_html, 'html.parser')\n",
"    for questions in soup.find_all('div', class_='pull-left'):\n",
"        for question in questions.children:\n",
"            # .children also yields text and comment nodes, which have no\n",
"            # find_all(); only real tags can hold a question.\n",
"            if not isinstance(question, Tag):\n",
"                continue\n",
"\n",
"            title_info = []  # num, type, question\n",
"            answers_info = []\n",
"            answers_correct_info = []\n",
"            explain_info = \"\"\n",
"\n",
"            # Stem: one entry per child node of the stem container.\n",
"            for title in question.find_all('div', class_='p-stem'):\n",
"                for element in title.children:\n",
"                    title_info.append(element.text)\n",
"\n",
"            # Options: each direct child div of the option list is one option.\n",
"            for answer in question.find_all('div', class_='answer-ul'):\n",
"                for element in answer.find_all(\"div\", recursive=False):\n",
"                    # Correctness flag. get(\"class\") is None for a div without a\n",
"                    # class attribute, so fall back to an empty list.\n",
"                    answers_correct_info.append(\"answer\" in (element.get(\"class\") or []))\n",
"\n",
"                    # Option text.\n",
"                    for text_element in element.select(\"div > div > div > div > p\"):\n",
"                        if text_element.text is not None and text_element.text != \"\":\n",
"                            answers_info.append(text_element.text)\n",
"\n",
"            # Explanation: concatenate every explanation block.\n",
"            for explain in question.find_all('div', class_='practise-answer-text'):\n",
"                explain_info += str(explain.get_text(strip=True))\n",
"\n",
"            # Missing stem parts/options fall back to \"\" and missing flags to False.\n",
"            cursor = conn.execute(\n",
"                \"INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n",
"                (\n",
"                    info[1],\n",
"                    info[2],\n",
"                    list_get(title_info, 0),\n",
"                    list_get(title_info, 1),\n",
"                    list_get(title_info, 2),\n",
"                    list_get(answers_info, 0),\n",
"                    list_get(answers_info, 1),\n",
"                    list_get(answers_info, 2),\n",
"                    list_get(answers_info, 3),\n",
"                    list_get(answers_correct_info, 0, False),\n",
"                    list_get(answers_correct_info, 1, False),\n",
"                    list_get(answers_correct_info, 2, False),\n",
"                    list_get(answers_correct_info, 3, False),\n",
"                    explain_info,\n",
"                )\n",
"            )\n",
"            # Remember which page the new row came from.\n",
"            inserted_id = cursor.lastrowid\n",
"            conn.execute(\n",
"                \"INSERT INTO url (id, url) VALUES (?, ?)\",\n",
"                (inserted_id, info[0], )\n",
"            )\n",
"    conn.commit()"
],
"id": "c28a23cbd84f6ea0",
"outputs": [],
"execution_count": 45
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.273569Z",
"start_time": "2025-08-19T01:00:42.265569Z"
}
},
"cell_type": "code",
"source": [
"bg_infos = [\n",
" [\"期货乐橙章节练习\", 1, 1414, 1],\n",
" [\"期货乐橙章节练习\", 2, 1419, 3],\n",
" [\"期货乐橙章节练习\", 3, 1448, 2],\n",
" [\"期货乐橙章节练习\", 4, 1485, 2],\n",
" [\"期货乐橙章节练习\", 5, 1523, 2],\n",
" [\"期货乐橙章节练习\", 6, 1543, 2],\n",
"]"
],
"id": "f8ed3be15b2a69a7",
"outputs": [],
"execution_count": 46
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:00:42.311569Z",
"start_time": "2025-08-19T01:00:42.302568Z"
}
},
"cell_type": "code",
"source": [
"def main():\n",
"    \"\"\"Scrape every section of every chapter in ``bg_infos`` into the database.\n",
"\n",
"    An exception raised while scraping is printed and ends the run. The\n",
"    database connection and the browser are closed afterwards in every case,\n",
"    so ``conn`` and ``driver`` must be recreated before main() is called again.\n",
"    \"\"\"\n",
"    try:\n",
"        for bg_info in bg_infos:\n",
"            chapter_id, section_count = bg_info[2], bg_info[3]\n",
"            # selectSec in the URL is 1-based.\n",
"            for index in range(1, section_count + 1):\n",
"                url = f\"https://www.bestlec.com/practise/practise?title=%E9%A1%BA%E5%BA%8F%E7%BB%83%E4%B9%A0&qBankId=39&qBankTitle=%E3%80%90%E6%9C%9F%E8%B4%A7%E6%B3%95%E8%A7%84%E3%80%91%E7%AB%A0%E8%8A%82%E7%BB%83%E4%B9%A0&chapterId={chapter_id}&practise=1&type=practise&selectSec={index}\"\n",
"                rendered_html = get_web(url)\n",
"                write2db(rendered_html, [url, bg_info[0], bg_info[1]])\n",
"    except Exception as e:\n",
"        # Pass the exception as its own argument: \"error: \" + e raises TypeError\n",
"        # (str + Exception) and would hide the original failure.\n",
"        print(\"error:\", e)\n",
"    finally:\n",
"        # Release both resources independently so a failure in one does not\n",
"        # prevent the other from being closed.\n",
"        try:\n",
"            conn.close()\n",
"        except Exception as e:\n",
"            print(\"db:\", e)\n",
"\n",
"        try:\n",
"            driver.quit()\n",
"        except Exception as e:\n",
"            print(\"browser:\", e)"
],
"id": "fcfc560b46c29aaa",
"outputs": [],
"execution_count": 47
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:01:51.670165Z",
"start_time": "2025-08-19T01:00:42.337618Z"
}
},
"cell_type": "code",
"source": [
"if __name__ == '__main__':\n",
" main()"
],
"id": "811c9d3647c46f8b",
"outputs": [],
"execution_count": 48
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:01:51.740128Z",
"start_time": "2025-08-19T01:01:51.737199Z"
}
},
"cell_type": "code",
"source": "",
"id": "5224515d66fe0b",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}