# All imports live in one place: standard library first, then third-party.
import sqlite3
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# SQLite database that receives the scraped questions.
db_path = '../data.db'
conn = sqlite3.connect(db_path)

# Call conn.close() once scraping is finished.
# conn.close()

# Edge launch options.
edge_options = Options()
# Optional: uncomment to run without a visible browser window (headless mode).
# edge_options.add_argument("--headless")
for flag in (
    "--disable-gpu",
    "--no-sandbox",
    "--disable-extensions",
    "--disable-plugins",
    "--disable-popup-blocking",
    "--disable-infobars",
    "--disable-notifications",
    "--no-first-run",
    "--no-default-browser-check",
):
    edge_options.add_argument(flag)

# Reuse a persistent browser profile directory.
user_data_dir = r"D:\code\edge"
edge_options.add_argument(f"--user-data-dir={user_data_dir}")
# Select the profile (optional; the default profile is "Default").
edge_options.add_argument("--profile-directory=Default")

# Path to EdgeDriver (optional; can be omitted when msedgedriver is on PATH).
service = Service(executable_path=r"D:\app\edgeDriver\msedgedriver.exe")
# Create the Edge browser instance.
driver = webdriver.Edge(service=service, options=edge_options)

driver.get("https://www.tianyiwangxiao.com/new/question-bank/learn-center-analyze/4d60c96ef05c452b812654e78af7701a/1957604601548296194?from=ht2")

# Second URL kept from the original cell, where it was a bare (no-op) string
# expression; presumably the entry point for another question set -- confirm.
# "https://www.tianyiwangxiao.com/new/question-bank/learn-center-analyze/94cfba022e2f4c7ebbeaa400576b3a9a/1958703246433423361?from=ht2"

# Wait for the page to render (e.g. until <body> is present).
# NOTE(review): the 720 s timeout presumably leaves room for a manual login -- confirm.
wait = WebDriverWait(driver, 720)
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
time.sleep(3)


def next_page():
    """Click the "next" arrow (``.el-icon-right.next``) once it is clickable.

    Uses the module-level ``wait``; raises whatever ``wait.until`` raises when
    the element does not become clickable within the configured timeout.
    """
    next_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, ".el-icon-right.next"))
    )
    next_button.click()


def get_html():
    """Return the browser's current page source as a string."""
    return driver.page_source
def _parse_true_false(soup):
    """Return the answer of a true/false question, or None when it is not found.

    Scans every ``div.CORRECT`` inside every ``div.answer-box`` and maps the
    text "正确" to True and "错误" to False.
    NOTE(review): ``CORRECT`` presumably marks the correct-answer element --
    confirm against the page markup.
    """
    answer_boxes = soup.find_all('div', class_='answer-box')
    for box in answer_boxes:
        for marker in box.find_all('div', class_='CORRECT'):
            marker_text = marker.text.strip()
            if marker_text == "正确":
                return True
            if marker_text == "错误":
                return False
    # Nothing matched: dump what was found to help diagnose the markup.
    print(answer_boxes)
    # None (not 0) so that "unknown" can never be confused with False.
    return None


def html_parser(rendered_html):
    """Extract one question from the rendered question-analysis page.

    Args:
        rendered_html: HTML source of the page (e.g. ``driver.page_source``).

    Returns:
        A dict with the keys ``"title"`` and ``"analyze"`` (stripped text of
        the first ``p.title`` / ``div.analyze``) plus either

        * one entry per option label, lower-cased (``"a"``..``"d"`` is what
          ``write2db`` expects), each ``[option_text, is_correct]`` -- when the
          options box holds exactly four ``div.options-item`` elements; or
        * ``"tf"``: True / False for a true/false question, or None when the
          answer could not be located.

    Raises:
        ValueError: if the title or analysis element is missing, or if the
            page does not contain exactly one ``div.options-box``.
    """
    soup = BeautifulSoup(rendered_html, 'html.parser')

    titles = soup.find_all('p', class_='title')
    option_boxes = soup.find_all('div', class_='options-box')
    analyses = soup.find_all('div', class_='analyze')

    if not titles or not analyses:
        raise ValueError("page is missing the 'p.title' or 'div.analyze' element")
    # The original code did `raise <ResultSet>`, which itself fails with
    # "TypeError: exceptions must derive from BaseException".
    if len(option_boxes) != 1:
        raise ValueError(
            f"expected exactly one 'div.options-box', found {len(option_boxes)}"
        )

    result = {"title": titles[0].text.strip(), "analyze": analyses[0].text.strip()}

    option_items = option_boxes[0].find_all("div", class_="options-item")

    if len(option_items) == 4:
        # Four-option choice question: record each option's text and whether
        # its label carries the "success-active" class.
        for item in option_items:
            label = item.find("p", "label")
            is_correct = "success-active" in label.get("class")
            key = label.text.strip().lower()
            option_text = item.find("p", "text").text.strip()
            result[key] = [option_text, is_correct]
    else:
        # Anything else is treated as a true/false question.
        result["tf"] = _parse_true_false(soup)

    return result


# Single INSERT shared by both question types.
_INSERT_QUESTION_SQL = (
    "INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, "
    "a_result, b_result, c_result, d_result, explanation) "
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
)


def write2db(index, result):
    """Insert one parsed question into the ``questions`` table and commit.

    Reads the module-level ``conn`` (SQLite connection) and ``info``
    (``info[0]`` -> ``title`` column, ``info[1]`` -> ``chapter`` column).

    Args:
        index: value stored in the ``q_num`` column.
        result: dict produced by ``html_parser``.

    A true/false question whose answer is unknown (``result["tf"]`` is not a
    bool, e.g. None or the legacy sentinel 0) is skipped without writing.
    """
    if "tf" not in result:
        q_type = "多选题"
        options = [result[key] for key in ("a", "b", "c", "d")]
        option_texts = [option[0] for option in options]
        option_flags = [option[1] for option in options]
    else:
        tf = result["tf"]
        # The original test was `tf == 0`, which is also true for False and
        # therefore dropped every question whose answer is "错误".
        if not isinstance(tf, bool):
            return
        q_type = "判断题"
        option_texts = ["", "", "", ""]
        # a_result marks "true is correct", b_result marks "false is correct".
        option_flags = [1 if tf else 0, 0 if tf else 1, "", ""]

    # `with conn` commits on success and rolls back if the INSERT fails.
    with conn:
        conn.execute(
            _INSERT_QUESTION_SQL,
            (
                info[0],
                info[1],
                index,
                q_type,
                result.get("title"),
                *option_texts,
                *option_flags,
                result.get("analyze"),
            ),
        )


# [value for the title column, value for the chapter column, number of pages to scrape]
info = ["天一",0,130]

# Scrape `info[2]` consecutive questions: parse the current page, store it,
# then advance to the next one.
# NOTE(review): there is no explicit wait between next_page() and the following
# get_html(); confirm the page content is updated before it is read.
for i in range(info[2]):
    p = get_html()
    result = html_parser(p)
    write2db(i, result)
    next_page()