import sqlite3
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# --- Database ---------------------------------------------------------------
# SQLite file that receives the scraped questions (relative to the notebook).
db_path = '../data.db'

conn = sqlite3.connect(db_path)

# Every statement uses IF NOT EXISTS so this cell can be re-run (Restart & Run
# All) against an existing database file instead of failing with
# "table ... already exists".
conn.execute('''CREATE TABLE IF NOT EXISTS "questions"
(
    id          INTEGER
        constraint questions_pk
            primary key autoincrement,
    title       TEXT,
    chapter     TEXT,
    q_num       text,
    q_type      text,
    question    TEXT not null,
    a           TEXT not null,
    b           TEXT not null,
    c           TEXT not null,
    d           TEXT not null,
    a_result    BLOB default false,
    b_result    BLOB default false,
    c_result    BLOB default false,
    d_result    BLOB default false,
    explanation TEXT,
    count       integer default 3 not null
)''')

conn.execute('''CREATE TABLE IF NOT EXISTS "answers_history"
(
    id        INTEGER not null
        constraint answers_history__questions_id_fk
            references questions,
    time_used INTEGER,
    state     INTEGER,
    time      text default CURRENT_TIMESTAMP
)''')

# Maps a question row id to the page URL it was scraped from.
conn.execute('''CREATE TABLE IF NOT EXISTS url
(
    id  INTEGER not null,
    url TEXT
)''')

conn.commit()

# --- Browser ----------------------------------------------------------------
edge_options = Options()
# edge_options.add_argument("--headless")  # optional: run without a window
for _flag in (
    "--disable-gpu",
    "--no-sandbox",
    "--disable-extensions",
    "--disable-plugins",
    "--disable-popup-blocking",
    "--disable-infobars",
    "--disable-notifications",
    "--no-first-run",
    "--no-default-browser-check",
):
    edge_options.add_argument(_flag)

# NOTE: machine-specific absolute path; adjust when running elsewhere.
user_data_dir = r"D:\code\edge"
edge_options.add_argument(f"--user-data-dir={user_data_dir}")
# Profile inside the user-data directory (optional; "Default" is the default).
edge_options.add_argument("--profile-directory=Default")

# Path to the EdgeDriver binary (optional; can be omitted when the driver is
# already reachable through the environment).
# NOTE: machine-specific absolute path; adjust when running elsewhere.
service = Service(executable_path=r"D:\app\edgeDriver\msedgedriver.exe")
# Start the Edge browser instance shared by the cells below.
driver = webdriver.Edge(service=service, options=edge_options)
def get_web(url, timeout=720, settle_seconds=3):
    """Open ``url`` in the shared ``driver`` and return the rendered HTML.

    Args:
        url: Address of the page to load.
        timeout: Maximum number of seconds to wait for the ``<body>`` element
            to be present. NOTE(review): the large default presumably leaves
            room for a manual login in the browser window -- confirm.
        settle_seconds: Fixed pause after ``<body>`` is present, before the
            page source is read.

    Returns:
        ``driver.page_source`` after the wait and the pause.

    Raises:
        selenium.common.exceptions.TimeoutException: ``<body>`` did not appear
            within ``timeout`` seconds.
    """
    driver.get(url)

    # Wait for the page to render (here: until <body> exists), then pause.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )
    time.sleep(settle_seconds)

    # Disabled: switch the page into "recitation" mode before reading it.
    # wait = WebDriverWait(driver, timeout)
    # wait.until(EC.element_to_be_clickable(
    #     (By.CSS_SELECTOR, ".setting-type.iconfont.icon-setting"))).click()
    # wait.until(EC.element_to_be_clickable(
    #     (By.CSS_SELECTOR, ".question-setting-button.ant-btn.ant-btn-default"))).click()

    # HTML as rendered by the browser.
    return driver.page_source


def list_get(lst, index, default=""):
    """Return ``lst[index]``, or ``default`` when ``index`` is out of range."""
    try:
        return lst[index]
    except IndexError:
        return default


def _correct_option_flags(answer_text):
    """Return four booleans: whether "A", "B", "C", "D" occur in ``answer_text``."""
    return [letter in answer_text for letter in "ABCD"]


def write2db(rendered_html, info):
    """Parse one rendered page and store every question found on it.

    Args:
        rendered_html: HTML of the page, as returned by ``get_web``.
        info: Sequence ``[url, title, chapter]``. ``title`` and ``chapter``
            are stored on every ``questions`` row; ``url`` is stored in the
            ``url`` table next to the id of the inserted question.

    Side effects:
        Inserts rows through the module-level ``conn``. All rows of the page
        are committed together; if anything raises, the page's inserts are
        rolled back and the exception propagates. Prints a warning for every
        question whose answer text names none of the options A-D.
    """
    soup = BeautifulSoup(rendered_html, 'html.parser')

    # The connection context manager commits on success and rolls back on error.
    with conn:
        for container in soup.find_all('div', class_='pull-left'):
            # Only direct child *tags*: text nodes between elements have no
            # find_all() and cannot hold a question.
            for question in container.find_all(True, recursive=False):
                # Stem: the children are read positionally as
                # number, type, question text.
                title_info = [
                    element.text
                    for stem in question.find_all('div', class_='p-stem')
                    for element in stem.children
                ]

                # Option texts, in document order; empty paragraphs are skipped.
                answers_info = [
                    option.text
                    for answer in question.find_all('div', class_='answerClass')
                    for option in answer.select("div > div > p")
                    if option.text
                ]

                # Answer / explanation blocks; the first one carries the
                # letters of the correct options.
                explains = question.find_all('div', class_='practise-answer-text')

                if not (title_info or answers_info or explains):
                    # Nothing question-like inside this child; do not store
                    # an empty row for it.
                    continue

                if explains:
                    answers_correct_info = _correct_option_flags(explains[0].text)
                else:
                    answers_correct_info = [False, False, False, False]
                if not any(answers_correct_info):
                    print("0 answers found!!", title_info)

                explain_info = "".join(
                    str(explain.get_text(strip=True)) for explain in explains
                )

                cursor = conn.execute(
                    "INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                    (
                        info[1],
                        info[2],
                        list_get(title_info, 0),
                        list_get(title_info, 1),
                        list_get(title_info, 2),
                        list_get(answers_info, 0),
                        list_get(answers_info, 1),
                        list_get(answers_info, 2),
                        list_get(answers_info, 3),
                        *answers_correct_info,
                        explain_info,
                    ),
                )
                conn.execute(
                    "INSERT INTO url (id, url) VALUES (?, ?)",
                    (cursor.lastrowid, info[0]),
                )
# Each entry: [title, chapter, id used in the page URL, number of fetches].
ttt = "期货乐橙假题"
bg_infos = [
    [ttt, 0, 1385, 1],
    [ttt, 0, 1392, 1],
    [ttt, 0, 1393, 1],
    [ttt, 0, 1394, 1],
    [ttt, 0, 1395, 1],
]


def main():
    """Scrape every entry of ``bg_infos`` into the database.

    For each entry the page URL is built from the entry's id, fetched with
    ``get_web`` and stored with ``write2db``. Any exception stops the run and
    is reported on stdout. The database connection and the browser are always
    released at the end, so ``conn`` and ``driver`` must be re-created before
    ``main`` can run again in the same kernel.
    """
    try:
        for bg_info in bg_infos:
            # NOTE: ``index`` only appears in the disabled chapter-practice
            # URL below; with the active URL every iteration loads the same
            # page, so a count above 1 would store the page more than once.
            for index in range(1, bg_info[3] + 1):
                # url = f"https://www.bestlec.com/practise/practise?title=%E9%A1%BA%E5%BA%8F%E7%BB%83%E4%B9%A0&qBankId=39&qBankTitle=%E3%80%90%E6%9C%9F%E8%B4%A7%E6%B3%95%E8%A7%84%E3%80%91%E7%AB%A0%E8%8A%82%E7%BB%83%E4%B9%A0&chapterId={bg_info[2]}&practise=1&type=practise&selectSec={index}"

                url = f"https://www.bestlec.com/practise/practise?id={bg_info[2]}&qBankTitle=%E3%80%90%E6%9C%9F%E8%B4%A7%E6%B3%95%E8%A7%84%E3%80%91%E5%8E%86%E5%B9%B4%E7%9C%9F%E9%A2%98&title=%E7%9C%9F%E9%A2%98%E8%80%83%E8%AF%95&type=test&testType=search_paper"

                rendered_html = get_web(url)
                write2db(rendered_html, [url, bg_info[0], bg_info[1]])
    except Exception as e:
        # Format the exception instead of concatenating it: ``str + Exception``
        # raises TypeError and would hide the real failure.
        print(f"error: {e}")
    finally:
        # Release both resources independently so that a failure while
        # closing one does not keep the other open.
        try:
            conn.close()
        except Exception as e:
            print("db:", e)

        try:
            driver.quit()
        except Exception as e:
            print("browser:", e)


if __name__ == '__main__':
    main()
"name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }