This commit is contained in:
2025-08-20 16:25:51 +08:00
commit c451fdf0e5
11 changed files with 1632 additions and 0 deletions

View File

@@ -0,0 +1,415 @@
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2025-08-19T02:05:36.929261Z",
"start_time": "2025-08-19T02:05:36.924419Z"
}
},
"source": [
"import time\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.edge.service import Service\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.webdriver.edge.options import Options"
],
"outputs": [],
"execution_count": 86
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T02:05:37.477084Z",
"start_time": "2025-08-19T02:05:37.467743Z"
}
},
"cell_type": "code",
"source": [
"from bs4 import BeautifulSoup\n",
"import sqlite3"
],
"id": "59b26d9f105eae85",
"outputs": [],
"execution_count": 87
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T02:05:38.051715Z",
"start_time": "2025-08-19T02:05:38.047420Z"
}
},
"cell_type": "code",
"source": "# SQLite database file passed to sqlite3.connect(); a relative path is resolved against the kernel's working directory.\ndb_path = '../data.db'",
"id": "37a70656848ceced",
"outputs": [],
"execution_count": 88
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T02:05:38.522754Z",
"start_time": "2025-08-19T02:05:38.515572Z"
}
},
"cell_type": "code",
"source": [
"# Open (or create) the SQLite database and make sure the three tables used by the scraper exist.\n",
"conn = sqlite3.connect(db_path)\n",
"\n",
"# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises\n",
"# sqlite3.OperationalError as soon as the database file already contains the table.\n",
"# Column definitions are unchanged; a-d hold the option texts and a_result-d_result\n",
"# hold the matching correct-answer flags written by write2db().\n",
"conn.execute('''CREATE TABLE IF NOT EXISTS \"questions\"\n",
"(\n",
"    id INTEGER\n",
"        constraint questions_pk\n",
"            primary key autoincrement,\n",
"    title TEXT,\n",
"    chapter TEXT,\n",
"    q_num text,\n",
"    q_type text,\n",
"    question TEXT not null,\n",
"    a TEXT not null,\n",
"    b TEXT not null,\n",
"    c TEXT not null,\n",
"    d TEXT not null,\n",
"    a_result BLOB default false,\n",
"    b_result BLOB default false,\n",
"    c_result BLOB default false,\n",
"    d_result BLOB default false,\n",
"    explanation TEXT,\n",
"    count integer default 3 not null\n",
")''')\n",
"\n",
"# One row per recorded answer attempt; id refers to questions.id.\n",
"conn.execute('''CREATE TABLE IF NOT EXISTS \"answers_history\"\n",
"(\n",
"    id INTEGER not null\n",
"        constraint answers_history__questions_id_fk\n",
"            references questions,\n",
"    time_used INTEGER,\n",
"    state INTEGER,\n",
"    time text default CURRENT_TIMESTAMP\n",
")''')\n",
"\n",
"# Source page of each stored question; write2db() fills id with the question's rowid.\n",
"conn.execute('''CREATE TABLE IF NOT EXISTS url\n",
"(\n",
"    id INTEGER not null,\n",
"    url TEXT\n",
")''')\n",
"\n",
"conn.commit()"
],
"id": "d70a270099e8b056",
"outputs": [],
"execution_count": 89
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T02:05:39.660112Z",
"start_time": "2025-08-19T02:05:39.654188Z"
}
},
"cell_type": "code",
"source": [
"# Build the Edge launch options; the flags are added in the order listed.\n",
"edge_options = Options()\n",
"# edge_options.add_argument(\"--headless\")  # optional: run without a visible window\n",
"for _edge_flag in (\n",
"    \"--disable-gpu\",\n",
"    \"--no-sandbox\",\n",
"    \"--disable-extensions\",\n",
"    \"--disable-plugins\",\n",
"    \"--disable-popup-blocking\",\n",
"    \"--disable-infobars\",\n",
"    \"--disable-notifications\",\n",
"    \"--no-first-run\",\n",
"    \"--no-default-browser-check\",\n",
"):\n",
"    edge_options.add_argument(_edge_flag)\n",
"\n",
"# Point the browser at the profile data stored under user_data_dir.\n",
"user_data_dir = r\"D:\\code\\edge\"\n",
"edge_options.add_argument(\"--user-data-dir=\" + user_data_dir)\n",
"# Select the profile directory (optional; the default is \"Default\").\n",
"edge_options.add_argument(\"--profile-directory=Default\")"
],
"id": "e4a35062c4549f44",
"outputs": [],
"execution_count": 90
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T02:05:42.646764Z",
"start_time": "2025-08-19T02:05:40.137721Z"
}
},
"cell_type": "code",
"source": [
"# Path to the EdgeDriver executable (optional; may be omitted when the driver is already configured via environment variables).\n",
"service = Service(executable_path=r\"D:\\app\\edgeDriver\\msedgedriver.exe\")\n",
"# Create the Edge browser instance using the options configured earlier in the notebook.\n",
"driver = webdriver.Edge(service=service, options=edge_options)"
],
"id": "9b48ddaca80598aa",
"outputs": [],
"execution_count": 91
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:58:29.792348Z",
"start_time": "2025-08-19T01:58:29.786681Z"
}
},
"cell_type": "code",
"source": [
"def get_web(url, timeout=720, settle_seconds=3):\n",
"    \"\"\"Load ``url`` in the shared ``driver`` and return the rendered HTML.\n",
"\n",
"    Args:\n",
"        url: Address of the page to open.\n",
"        timeout: Maximum number of seconds to wait for the ``<body>`` element to be present.\n",
"        settle_seconds: Fixed pause, in seconds, between ``<body>`` appearing and the\n",
"            page source being read.\n",
"\n",
"    Returns:\n",
"        The value of ``driver.page_source`` after the pause.\n",
"\n",
"    Raises:\n",
"        selenium.common.exceptions.TimeoutException: If ``<body>`` is not present\n",
"            within ``timeout`` seconds.\n",
"    \"\"\"\n",
"    driver.get(url)\n",
"\n",
"    # Wait until the document has a <body>, then pause; the pause presumably gives\n",
"    # client-side scripts time to render the questions (TODO confirm a tighter condition).\n",
"    wait = WebDriverWait(driver, timeout)\n",
"    wait.until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n",
"    time.sleep(settle_seconds)\n",
"\n",
"    # Disabled step: switch the page into recitation mode before reading it.\n",
"    # clickable_element = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, \".setting-type.iconfont.icon-setting\")))\n",
"    # clickable_element.click()\n",
"    # wait.until(\n",
"    #     EC.element_to_be_clickable((By.CSS_SELECTOR, \".question-setting-button.ant-btn.ant-btn-default\"))).click()\n",
"\n",
"    # HTML as rendered by the browser, not the raw server response.\n",
"    return driver.page_source"
],
"id": "2b02063fec8abbdd",
"outputs": [],
"execution_count": 80
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:58:30.323362Z",
"start_time": "2025-08-19T01:58:30.317695Z"
}
},
"cell_type": "code",
"source": [
"def list_get(lst, index, default=\"\"):\n",
"    \"\"\"Return ``lst[index]``, or ``default`` when that lookup raises ``IndexError``.\"\"\"\n",
"    try:\n",
"        item = lst[index]\n",
"    except IndexError:\n",
"        item = default\n",
"    return item"
],
"id": "de9650bb0e005d4a",
"outputs": [],
"execution_count": 81
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:58:30.884914Z",
"start_time": "2025-08-19T01:58:30.871279Z"
}
},
"cell_type": "code",
"source": [
"def _parse_question(question):\n",
"    \"\"\"Extract the stem parts, option texts, answer flags and explanation from one question tag.\n",
"\n",
"    Args:\n",
"        question: A BeautifulSoup tag wrapping a single question.\n",
"\n",
"    Returns:\n",
"        A ``(title_info, answers_info, answers_correct_info, explain_info)`` tuple: two lists\n",
"        of strings, a list of four booleans (options A-D) and the concatenated explanation text.\n",
"    \"\"\"\n",
"    # Text of every child of each 'p-stem' div, in document order; write2db() stores\n",
"    # items 0, 1 and 2 as q_num, q_type and question.\n",
"    title_info = [\n",
"        element.text\n",
"        for title in question.find_all('div', class_='p-stem')\n",
"        for element in title.children\n",
"    ]\n",
"\n",
"    # Non-empty option paragraphs, in document order.\n",
"    answers_info = [\n",
"        text_element.text\n",
"        for answer in question.find_all('div', class_='answerClass')\n",
"        for text_element in answer.select(\"div > div > p\")\n",
"        if text_element.text\n",
"    ]\n",
"\n",
"    answers_correct_info = [False, False, False, False]\n",
"    explains = question.find_all('div', class_='practise-answer-text')\n",
"\n",
"    # Guard the empty case first: indexing explains[0] unconditionally raised IndexError\n",
"    # for questions without an answer block and aborted the whole run.\n",
"    if not explains or len(explains[0]) == 0:\n",
"        print(\"0 answers found!!\", title_info)\n",
"\n",
"    if explains:\n",
"        # Every A/B/C/D character in the first answer block marks that option as correct.\n",
"        # NOTE(review): letters inside explanatory text in that block would also be counted - confirm the markup.\n",
"        for char in explains[0].text:\n",
"            option_index = \"ABCD\".find(char)\n",
"            if option_index != -1:\n",
"                answers_correct_info[option_index] = True\n",
"\n",
"    explain_info = \"\".join(explain.get_text(strip=True) for explain in explains)\n",
"    return title_info, answers_info, answers_correct_info, explain_info\n",
"\n",
"\n",
"def write2db(rendered_html, info):\n",
"    \"\"\"Parse one rendered page and store every question found on it.\n",
"\n",
"    Each question is inserted into ``questions``, its source URL into ``url``, and the\n",
"    transaction is committed per question so earlier rows survive a later failure.\n",
"\n",
"    Args:\n",
"        rendered_html: HTML text of the page to parse.\n",
"        info: Indexable with ``info[0]`` the page URL, ``info[1]`` the title and\n",
"            ``info[2]`` the chapter stored with each question.\n",
"    \"\"\"\n",
"    soup = BeautifulSoup(rendered_html, 'html.parser')\n",
"\n",
"    for questions in soup.find_all('div', class_='pull-left'):\n",
"        # Direct child tags only: iterating .children also yields text nodes\n",
"        # (e.g. whitespace between tags), which do not support find_all().\n",
"        for question in questions.find_all(True, recursive=False):\n",
"            title_info, answers_info, answers_correct_info, explain_info = _parse_question(question)\n",
"\n",
"            # Missing stem parts or options fall back to \"\" so the NOT NULL columns are satisfied.\n",
"            params = (\n",
"                info[1],\n",
"                info[2],\n",
"                *(list_get(title_info, i) for i in range(3)),\n",
"                *(list_get(answers_info, i) for i in range(4)),\n",
"                *answers_correct_info,\n",
"                explain_info,\n",
"            )\n",
"            cursor = conn.execute(\n",
"                \"INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n",
"                params,\n",
"            )\n",
"            # Record which page the freshly inserted question came from.\n",
"            conn.execute(\n",
"                \"INSERT INTO url (id, url) VALUES (?, ?)\",\n",
"                (cursor.lastrowid, info[0]),\n",
"            )\n",
"            conn.commit()"
],
"id": "c28a23cbd84f6ea0",
"outputs": [],
"execution_count": 82
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:58:31.409810Z",
"start_time": "2025-08-19T01:58:31.404300Z"
}
},
"cell_type": "code",
"source": [
"# Each entry is [title, chapter, paper id, number of fetch iterations]; main() reads the fields by position.\n",
"ttt = \"期货乐橙假题\"\n",
"bg_infos = [[ttt, 0, paper_id, 1] for paper_id in (1385, 1392, 1393, 1394, 1395)]"
],
"id": "f8ed3be15b2a69a7",
"outputs": [],
"execution_count": 83
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:58:32.106767Z",
"start_time": "2025-08-19T01:58:32.099456Z"
}
},
"cell_type": "code",
"source": [
"def main():\n",
"    \"\"\"Scrape every paper listed in ``bg_infos`` into the database, then release resources.\n",
"\n",
"    Errors raised while scraping are reported with ``print`` instead of being re-raised.\n",
"    ``conn`` and ``driver`` are closed in every case, so the cells that create them must\n",
"    be run again before calling ``main()`` a second time.\n",
"    \"\"\"\n",
"    try:\n",
"        for bg_info in bg_infos:\n",
"            # NOTE(review): the active URL below does not use `index`, so a value above 1 in\n",
"            # bg_info[3] fetches and stores the same page repeatedly.\n",
"            for index in range(1, bg_info[3] + 1):\n",
"                # Alternative chapter-practice URL, which does use `index`:\n",
"                #url = f\"https://www.bestlec.com/practise/practise?title=%E9%A1%BA%E5%BA%8F%E7%BB%83%E4%B9%A0&qBankId=39&qBankTitle=%E3%80%90%E6%9C%9F%E8%B4%A7%E6%B3%95%E8%A7%84%E3%80%91%E7%AB%A0%E8%8A%82%E7%BB%83%E4%B9%A0&chapterId={bg_info[2]}&practise=1&type=practise&selectSec={index}\"\n",
"\n",
"                url = f\"https://www.bestlec.com/practise/practise?id={bg_info[2]}&qBankTitle=%E3%80%90%E6%9C%9F%E8%B4%A7%E6%B3%95%E8%A7%84%E3%80%91%E5%8E%86%E5%B9%B4%E7%9C%9F%E9%A2%98&title=%E7%9C%9F%E9%A2%98%E8%80%83%E8%AF%95&type=test&testType=search_paper\"\n",
"\n",
"                rendered_html = get_web(url)\n",
"                write2db(rendered_html, [url, bg_info[0], bg_info[1]])\n",
"    except Exception as e:\n",
"        # Let print() format the exception: \"error: \" + e raised TypeError inside this\n",
"        # handler and hid the original failure.\n",
"        print(\"error:\", e)\n",
"    finally:\n",
"        # Clean up independently so a failure closing one resource does not skip the other.\n",
"        try:\n",
"            conn.close()\n",
"        except Exception as e:\n",
"            print(\"db:\", e)\n",
"\n",
"        try:\n",
"            driver.quit()\n",
"        except Exception as e:\n",
"            print(\"browser:\", e)"
],
"id": "fcfc560b46c29aaa",
"outputs": [],
"execution_count": 84
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:59:02.604021Z",
"start_time": "2025-08-19T01:58:32.772829Z"
}
},
"cell_type": "code",
"source": [
"# Entry point: run the scraper defined in main().\n",
"if __name__ == '__main__':\n",
"    main()"
],
"id": "811c9d3647c46f8b",
"outputs": [],
"execution_count": 85
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-08-19T01:59:03.138625Z",
"start_time": "2025-08-19T01:59:03.135518Z"
}
},
"cell_type": "code",
"source": "",
"id": "5224515d66fe0b",
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}