{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "notebook-overview",
   "metadata": {},
   "source": [
    "# Question bank scraper\n",
    "\n",
    "Loads the chapter practice pages of bestlec.com in Edge through Selenium, parses the questions with BeautifulSoup and stores them in the SQLite database at `db_path`.\n",
    "\n",
    "Run the cells top to bottom on a fresh kernel: `main()` closes both the database connection and the browser when it finishes, so the setup cells have to be run again before another scrape."
   ]
  },
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "import os\n",
    "import sqlite3\n",
    "import time\n",
    "import traceback\n",
    "\n",
    "from bs4 import BeautifulSoup\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.edge.options import Options\n",
    "from selenium.webdriver.edge.service import Service\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from selenium.webdriver.support.ui import WebDriverWait"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "37a70656848ceced",
   "metadata": {},
   "source": [
    "# SQLite file that receives the scraped questions.\n",
    "db_path = '../data.db'"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "d70a270099e8b056",
   "metadata": {},
   "source": [
    "# Open the database and create the tables if they are missing.\n",
    "# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises\n",
    "# \"table ... already exists\" once data.db is left over from an earlier run.\n",
    "# Existing rows are kept, so running main() again appends to them.\n",
    "conn = sqlite3.connect(db_path)\n",
    "conn.execute('''CREATE TABLE IF NOT EXISTS \"questions\"\n",
    "(\n",
    "    id          INTEGER\n",
    "        constraint questions_pk\n",
    "            primary key autoincrement,\n",
    "    title       TEXT,\n",
    "    chapter     TEXT,\n",
    "    q_num       text,\n",
    "    q_type      text,\n",
    "    question    TEXT not null,\n",
    "    a           TEXT not null,\n",
    "    b           TEXT not null,\n",
    "    c           TEXT not null,\n",
    "    d           TEXT not null,\n",
    "    a_result    BLOB    default false,\n",
    "    b_result    BLOB    default false,\n",
    "    c_result    BLOB    default false,\n",
    "    d_result    BLOB    default false,\n",
    "    explanation TEXT,\n",
    "    count       integer default 3 not null\n",
    ")''')\n",
    "\n",
    "conn.execute('''CREATE TABLE IF NOT EXISTS \"answers_history\"\n",
    "(\n",
    "    id        INTEGER not null\n",
    "        constraint answers_history__questions_id_fk\n",
    "            references questions,\n",
    "    time_used INTEGER,\n",
    "    state     INTEGER,\n",
    "    time      text default CURRENT_TIMESTAMP\n",
    ")''')\n",
    "\n",
    "conn.execute('''CREATE TABLE IF NOT EXISTS url\n",
    "                (\n",
    "                    id  INTEGER not null,\n",
    "                    url TEXT\n",
    "                )''')\n",
    "\n",
    "conn.commit()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "e4a35062c4549f44",
   "metadata": {},
   "source": [
    "# Command-line switches for the Edge session used for scraping.\n",
    "edge_options = Options()\n",
    "# edge_options.add_argument(\"--headless\")  # optional: run without a visible window\n",
    "for switch in (\n",
    "    \"--disable-gpu\",\n",
    "    \"--no-sandbox\",\n",
    "    \"--disable-extensions\",\n",
    "    \"--disable-plugins\",\n",
    "    \"--disable-popup-blocking\",\n",
    "    \"--disable-infobars\",\n",
    "    \"--disable-notifications\",\n",
    "    \"--no-first-run\",\n",
    "    \"--no-default-browser-check\",\n",
    "):\n",
    "    edge_options.add_argument(switch)\n",
    "\n",
    "# Edge is started with this user-data directory; set EDGE_USER_DATA_DIR to use\n",
    "# another location without editing the notebook.\n",
    "user_data_dir = os.environ.get(\"EDGE_USER_DATA_DIR\", r\"D:\\code\\edge\")\n",
    "edge_options.add_argument(f\"--user-data-dir={user_data_dir}\")\n",
    "# Profile inside the user-data directory (optional, the default is Default).\n",
    "edge_options.add_argument(\"--profile-directory=Default\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9b48ddaca80598aa",
   "metadata": {},
   "source": [
    "# Location of msedgedriver; set EDGE_DRIVER_PATH to override the default below.\n",
    "edge_driver_path = os.environ.get(\"EDGE_DRIVER_PATH\", r\"D:\\app\\edgeDriver\\msedgedriver.exe\")\n",
    "service = Service(executable_path=edge_driver_path)\n",
    "# Start the Edge browser instance shared by the cells below.\n",
    "driver = webdriver.Edge(service=service, options=edge_options)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "2b02063fec8abbdd",
   "metadata": {},
   "source": [
    "def get_web(url):\n",
    "    \"\"\"Open ``url`` in the shared Edge driver, switch to recite mode and return the HTML.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the practice page to load.\n",
    "\n",
    "    Returns:\n",
    "        ``driver.page_source`` read after the settings button has been clicked.\n",
    "\n",
    "    Raises:\n",
    "        selenium.common.exceptions.TimeoutException: if the body element or one of\n",
    "            the two buttons is not available within the 720 second wait.\n",
    "    \"\"\"\n",
    "    driver.get(url)\n",
    "\n",
    "    # Wait for the body element, then give client-side rendering a moment.\n",
    "    wait = WebDriverWait(driver, 720)\n",
    "    wait.until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n",
    "    time.sleep(3)\n",
    "\n",
    "    # Enter recite mode: open the settings panel, then press its button.\n",
    "    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, \".setting-type.iconfont.icon-setting\"))).click()\n",
    "    wait.until(\n",
    "        EC.element_to_be_clickable((By.CSS_SELECTOR, \".question-setting-button.ant-btn.ant-btn-default\"))).click()\n",
    "\n",
    "    # NOTE(review): the HTML is read right after the click with no further wait;\n",
    "    # confirm the page has finished re-rendering by then.\n",
    "    return driver.page_source"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "de9650bb0e005d4a",
   "metadata": {},
   "source": [
    "def list_get(lst, index, default=\"\"):\n",
    "    \"\"\"Return ``lst[index]``, or ``default`` when the index is out of range.\"\"\"\n",
    "    try:\n",
    "        return lst[index]\n",
    "    except IndexError:\n",
    "        return default"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "c28a23cbd84f6ea0",
   "metadata": {},
   "source": [
    "def _option_text(option):\n",
    "    \"\"\"Return the text of one answer option: its non-empty p texts joined together.\"\"\"\n",
    "    return \"\".join(p.text for p in option.select(\"div > div > div > div > p\") if p.text)\n",
    "\n",
    "\n",
    "def write2db(rendered_html, info):\n",
    "    \"\"\"Parse one rendered practice page and store every question found on it.\n",
    "\n",
    "    Each question becomes one row in ``questions`` plus one row in ``url`` that\n",
    "    carries the new question id. Only the first four options fit the a-d columns;\n",
    "    missing options are stored as \"\" / False.\n",
    "\n",
    "    Args:\n",
    "        rendered_html: HTML of the page, as returned by ``get_web``.\n",
    "        info: Sequence ``[url, title, chapter]``. ``title`` and ``chapter`` go into\n",
    "            the ``questions`` row, ``url`` into the matching ``url`` row.\n",
    "    \"\"\"\n",
    "    soup = BeautifulSoup(rendered_html, 'html.parser')\n",
    "\n",
    "    # One transaction per page: committed when the block ends normally, rolled back\n",
    "    # if parsing or an insert raises, so no half-written page stays in the database.\n",
    "    with conn:\n",
    "        for questions in soup.find_all('div', class_='pull-left'):\n",
    "            # find_all(True, recursive=False) yields only element children; .children\n",
    "            # would also yield text nodes, which cannot be searched for questions.\n",
    "            for question in questions.find_all(True, recursive=False):\n",
    "                title_info = []  # q_num, q_type, question text\n",
    "                answers_info = []\n",
    "                answers_correct_info = []\n",
    "\n",
    "                # Stem: number, type and question text.\n",
    "                for title in question.find_all('div', class_='p-stem'):\n",
    "                    for element in title.children:\n",
    "                        title_info.append(element.text)\n",
    "\n",
    "                # Options: exactly one text and one flag per option, so that\n",
    "                # a/b/c/d always line up with a_result/b_result/c_result/d_result.\n",
    "                for answer in question.find_all('div', class_='answer-ul'):\n",
    "                    for option in answer.find_all(\"div\", recursive=False):\n",
    "                        # get(\"class\") is None when the div has no class attribute.\n",
    "                        answers_correct_info.append(\"answer\" in (option.get(\"class\") or []))\n",
    "                        answers_info.append(_option_text(option))\n",
    "\n",
    "                # A child with neither stem nor options is not a question; skip it\n",
    "                # instead of inserting a blank row.\n",
    "                if not title_info and not answers_info:\n",
    "                    continue\n",
    "\n",
    "                # Explanation text.\n",
    "                explain_info = \"\".join(\n",
    "                    explain.get_text(strip=True)\n",
    "                    for explain in question.find_all('div', class_='practise-answer-text')\n",
    "                )\n",
    "\n",
    "                cursor = conn.execute(\n",
    "                    \"INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n",
    "                    (\n",
    "                        info[1],\n",
    "                        info[2],\n",
    "                        list_get(title_info, 0),\n",
    "                        list_get(title_info, 1),\n",
    "                        list_get(title_info, 2),\n",
    "                        list_get(answers_info, 0),\n",
    "                        list_get(answers_info, 1),\n",
    "                        list_get(answers_info, 2),\n",
    "                        list_get(answers_info, 3),\n",
    "                        list_get(answers_correct_info, 0, False),\n",
    "                        list_get(answers_correct_info, 1, False),\n",
    "                        list_get(answers_correct_info, 2, False),\n",
    "                        list_get(answers_correct_info, 3, False),\n",
    "                        explain_info,\n",
    "                    )\n",
    "                )\n",
    "                # Remember which page the new question row came from.\n",
    "                conn.execute(\n",
    "                    \"INSERT INTO url (id, url) VALUES (?, ?)\",\n",
    "                    (cursor.lastrowid, info[0])\n",
    "                )"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "f8ed3be15b2a69a7",
   "metadata": {},
   "source": [
    "# One entry per chapter, read by main():\n",
    "#   [title stored with each question, chapter stored with each question,\n",
    "#    chapterId URL parameter, number of sections (selectSec runs from 1 to this value)]\n",
    "bg_infos = [\n",
    "    [\"期货乐橙章节练习\", 1, 1414, 1],\n",
    "    [\"期货乐橙章节练习\", 2, 1419, 3],\n",
    "    [\"期货乐橙章节练习\", 3, 1448, 2],\n",
    "    [\"期货乐橙章节练习\", 4, 1485, 2],\n",
    "    [\"期货乐橙章节练习\", 5, 1523, 2],\n",
    "    [\"期货乐橙章节练习\", 6, 1543, 2],\n",
    "]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "fcfc560b46c29aaa",
   "metadata": {},
   "source": [
    "def main():\n",
    "    \"\"\"Scrape every section listed in ``bg_infos`` into the database, then clean up.\n",
    "\n",
    "    A failure while scraping is reported and stops the run; it is not re-raised.\n",
    "    The database connection and the browser are closed in every case, so the setup\n",
    "    cells must be run again before calling ``main()`` a second time.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        for bg_info in bg_infos:\n",
    "            title, chapter, chapter_id, section_count = bg_info\n",
    "            for index in range(1, section_count + 1):\n",
    "                url = f\"https://www.bestlec.com/practise/practise?title=%E9%A1%BA%E5%BA%8F%E7%BB%83%E4%B9%A0&qBankId=39&qBankTitle=%E3%80%90%E6%9C%9F%E8%B4%A7%E6%B3%95%E8%A7%84%E3%80%91%E7%AB%A0%E8%8A%82%E7%BB%83%E4%B9%A0&chapterId={chapter_id}&practise=1&type=practise&selectSec={index}\"\n",
    "                rendered_html = get_web(url)\n",
    "                write2db(rendered_html, [url, title, chapter])\n",
    "    except Exception as e:\n",
    "        # Pass the exception as its own print argument: \"error: \" + e raises\n",
    "        # TypeError (str + Exception) and would hide the real failure.\n",
    "        print(\"error:\", e)\n",
    "        traceback.print_exc()\n",
    "    finally:\n",
    "        # Best-effort cleanup: one failing close must not prevent the other.\n",
    "        try:\n",
    "            conn.close()\n",
    "        except Exception as e:\n",
    "            print(\"db:\", e)\n",
    "\n",
    "        try:\n",
    "            driver.quit()\n",
    "        except Exception as e:\n",
    "            print(\"browser:\", e)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "811c9d3647c46f8b",
   "metadata": {},
   "source": [
    "if __name__ == '__main__':\n",
    "    main()"
   ],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}