diff --git a/crawler/get_tianyi_1.ipynb b/crawler/get_tianyi_1.ipynb index c977068..2142b1f 100644 --- a/crawler/get_tianyi_1.ipynb +++ b/crawler/get_tianyi_1.ipynb @@ -6,8 +6,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2025-08-21T08:51:45.729834Z", - "start_time": "2025-08-21T08:51:45.724789Z" + "end_time": "2025-08-22T00:51:15.383830Z", + "start_time": "2025-08-22T00:51:15.162928Z" } }, "source": [ @@ -21,13 +21,13 @@ "from selenium.webdriver.edge.options import Options" ], "outputs": [], - "execution_count": 10 + "execution_count": 1 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T08:51:45.747072Z", - "start_time": "2025-08-21T08:51:45.743237Z" + "end_time": "2025-08-22T00:51:16.137665Z", + "start_time": "2025-08-22T00:51:16.016527Z" } }, "cell_type": "code", @@ -37,13 +37,13 @@ ], "id": "f184b255d5098302", "outputs": [], - "execution_count": 11 + "execution_count": 2 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T08:51:45.773737Z", - "start_time": "2025-08-21T08:51:45.769129Z" + "end_time": "2025-08-22T01:24:38.259284Z", + "start_time": "2025-08-22T01:24:38.253051Z" } }, "cell_type": "code", @@ -53,13 +53,26 @@ ], "id": "4813fcf4dea28b8d", "outputs": [], - "execution_count": 12 + "execution_count": 54 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T08:51:45.797561Z", - "start_time": "2025-08-21T08:51:45.790647Z" + "end_time": "2025-08-22T01:24:21.532983Z", + "start_time": "2025-08-22T01:24:21.528098Z" + } + }, + "cell_type": "code", + "source": "# conn.close()", + "id": "8ea63e4cb82fe0c", + "outputs": [], + "execution_count": 53 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-22T00:51:16.185320Z", + "start_time": "2025-08-22T00:51:16.176280Z" } }, "cell_type": "code", @@ -83,13 +96,13 @@ ], "id": "e5632e44a52d5dc4", "outputs": [], - "execution_count": 13 + "execution_count": 4 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T08:51:47.136948Z", - "start_time": "2025-08-21T08:51:45.814223Z" + "end_time": "2025-08-22T00:51:18.395Z", + "start_time": "2025-08-22T00:51:16.198246Z" } }, "cell_type": "code", @@ -101,19 +114,21 @@ ], "id": "28b1479c3decc6b1", "outputs": [], - "execution_count": 14 + "execution_count": 5 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T08:51:53.084554Z", - "start_time": "2025-08-21T08:51:47.152588Z" + "end_time": "2025-08-22T00:51:27.692312Z", + "start_time": "2025-08-22T00:51:18.413131Z" } }, "cell_type": "code", "source": [ "driver.get(\"https://www.tianyiwangxiao.com/new/question-bank/learn-center-analyze/4d60c96ef05c452b812654e78af7701a/1957604601548296194?from=ht2\")\n", "\n", + "\"https://www.tianyiwangxiao.com/new/question-bank/learn-center-analyze/94cfba022e2f4c7ebbeaa400576b3a9a/1958703246433423361?from=ht2\"\n", + "\n", "# 等待页面渲染完成(例如等待 body 加载)\n", "wait = WebDriverWait(driver, 720)\n", "wait.until(EC.presence_of_element_located((By.TAG_NAME, \"body\")))\n", @@ -122,148 +137,219 @@ ], "id": "779f88e1c3670c02", "outputs": [], - "execution_count": 15 + "execution_count": 6 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T08:51:59.934014Z", - "start_time": "2025-08-21T08:51:59.829632Z" + "end_time": "2025-08-22T00:51:27.716520Z", + "start_time": "2025-08-22T00:51:27.708786Z" } }, "cell_type": "code", "source": [ - "#进入背题模式\n", - "clickable_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, \".el-icon-right.next\")))\n", - "clickable_element.click()\n", - "\n" + "def next_page():\n", + " clickable_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, \".el-icon-right.next\")))\n", + " clickable_element.click()\n", + "\n", + "def get_html():\n", + " rendered_html = driver.page_source\n", + " return rendered_html" ], "id": "721f5a8a872bfdce", "outputs": [], - "execution_count": 17 + "execution_count": 7 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T08:52:07.855382Z", - "start_time": "2025-08-21T08:52:07.834085Z" + "end_time": "2025-08-22T01:24:45.201186Z", + "start_time": "2025-08-22T01:24:45.184772Z" } }, "cell_type": "code", "source": [ - "# 获取渲染后的 HTML\n", - "rendered_html = driver.page_source" - ], - "id": "aa728e660ee9bbe5", - "outputs": [], - "execution_count": 18 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-08-21T09:03:09.025560Z", - "start_time": "2025-08-21T09:03:08.973835Z" - } - }, - "cell_type": "code", - "source": [ - "soup = BeautifulSoup(rendered_html, 'html.parser')\n", + "def html_parser(rendered_html):\n", + " soup = BeautifulSoup(rendered_html, 'html.parser')\n", "\n", - "title = soup.find_all('p', class_='title')\n", + " title = soup.find_all('p', class_='title')\n", "\n", - "out_options_box = soup.find_all('div', class_='options-box')\n", + " out_options_box = soup.find_all('div', class_='options-box')\n", "\n", - "analyze = soup.find_all('div', class_='analyze')" + " analyze = soup.find_all('div', class_='analyze')\n", + "\n", + "\n", + " result={\"title\":title[0].text.strip(), \"analyze\":analyze[0].text.strip()}\n", + "\n", + " if (len(out_options_box)!=1):\n", + " raise out_options_box\n", + " out_options = out_options_box[0].find_all(\"div\",class_=\"options-item\")\n", + "\n", + " if len(out_options)==4:\n", + " # 多选\n", + " for out_option in out_options:\n", + "\n", + " abcd = out_option.find(\"p\",\"label\")\n", + "\n", + " trueFalse = False\n", + " if \"success-active\" in abcd.get(\"class\"):\n", + " trueFalse = True\n", + "\n", + " abcd = abcd.text.strip().lower()\n", + "\n", + " answer = out_option.find(\"p\",\"text\").text.strip()\n", + "\n", + " result[abcd] = [answer, trueFalse]\n", + " else:\n", + " # 单选\n", + " def get_tf():\n", + " out_options_box = soup.find_all('div', class_='answer-box')\n", + " for i in out_options_box:\n", + " for ii in i.find_all('div', class_='CORRECT'):\n", + " if ii.text == \"正确\":\n", + " return True\n", + " elif ii.text == \"错误\":\n", + " return False\n", + " print(out_options_box)\n", + " return 0\n", + " result[\"tf\"] = get_tf()\n", + "\n", + " return result\n", + "\n" ], "id": "5db0bbd564c0b53f", "outputs": [], - "execution_count": 33 + "execution_count": 55 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T09:03:10.108380Z", - "start_time": "2025-08-21T09:03:10.102301Z" - } - }, - "cell_type": "code", - "source": "print(title)", - "id": "9ae9f13772cfed6a", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[

根据《期货经营机构投资者适当性管理实施指引(试行)》,经营机构评估,划分所销售产品或者所提供服务的风险等级时,涉及投资组合的产品或服务的,下列表述中正确的是(  )。

]\n" - ] - } - ], - "execution_count": 34 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2025-08-21T09:11:05.144092Z", - "start_time": "2025-08-21T09:11:05.136517Z" + "end_time": "2025-08-22T01:41:12.394198Z", + "start_time": "2025-08-22T01:41:12.386634Z" } }, "cell_type": "code", "source": [ - "out_options_box = soup.find_all('div', class_='options-box')\n", + "def write2db(index, result):\n", "\n", + " if \"tf\" not in result:\n", "\n", - "if (len(out_options_box)!=1):\n", - " raise out_options_box\n", - "out_options = out_options_box[0].find_all(\"div\",class_=\"options-item\")\n", + " conn.execute(\n", + " \"INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n", + " (\n", + " info[0],\n", + " info[1],\n", + " index,\n", + " \"多选题\",\n", + " result.get(\"title\"),\n", + " result.get(\"a\")[0],\n", + " result.get(\"b\")[0],\n", + " result.get(\"c\")[0],\n", + " result.get(\"d\")[0],\n", + " result.get(\"a\")[1],\n", + " result.get(\"b\")[1],\n", + " result.get(\"c\")[1],\n", + " result.get(\"d\")[1],\n", + " result.get(\"analyze\"),\n", + " )\n", + " )\n", "\n", - "for out_option in out_options:\n", - " abcd = out_option.find(\"p\",\"label\")\n", + " else:\n", + " if result[\"tf\"] == 0:\n", + " return\n", "\n", - " trueFalse = False\n", - " if \"success-active\" in abcd.get(\"class\"):\n", - " trueFalse = True\n", + " conn.execute(\n", + " \"INSERT INTO questions (title, chapter, q_num, q_type, question, a, b, c, d, a_result, b_result, c_result, d_result, explanation) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)\",\n", + " (\n", + " info[0],\n", + " info[1],\n", + " index,\n", + " \"判断题\",\n", + " result.get(\"title\"),\n", + " \"\",\n", + " \"\",\n", + " \"\",\n", + " \"\",\n", + " 1 if result[\"tf\"] else 0,\n", + " 0 if result[\"tf\"] else 1,\n", + " \"\",\n", + " \"\",\n", + " result.get(\"analyze\"),\n", + " )\n", + " )\n", "\n", - " abcd = abcd.text.strip()\n", - "\n", - " answer = out_option.find(\"p\",\"text\").text.strip()\n", - "\n", - " print(abcd, answer, trueFalse)" + " conn.commit()" ], - "id": "11d9051ab089122d", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "A 可以按照产品或服务对应的任何一个风险等级进行评估 False\n", - "B 应当按照产品或服务最低风险等级进行评估 False\n", - "C 应当按照产品或服务最高风险等级进行评估 False\n", - "D 应当按照产品或服务整体风险等级进行评估 True\n" - ] - } - ], - "execution_count": 39 + "id": "853f278c1123cae1", + "outputs": [], + "execution_count": 69 }, { "metadata": { "ExecuteTime": { - "end_time": "2025-08-21T08:55:04.344999Z", - "start_time": "2025-08-21T08:55:04.339191Z" + "end_time": "2025-08-22T01:46:12.973092Z", + "start_time": "2025-08-22T01:46:12.968961Z" } }, "cell_type": "code", - "source": "print(analyze)", - "id": "b7f43a482ce3c619", + "source": "info = [\"天一\",0,130]", + "id": "71ef002122c67647", + "outputs": [], + "execution_count": 81 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-22T01:50:44.170181Z", + "start_time": "2025-08-22T01:50:27.773842Z" + } + }, + "cell_type": "code", + "source": [ + "for i in range(info[2]):\n", + " p = get_html()\n", + " result = html_parser(p)\n", + " write2db(i, result)\n", + " next_page()\n", + "\n" + ], + "id": "11d9051ab089122d", + "outputs": [], + "execution_count": 89 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-22T01:10:27.088143Z", + "start_time": "2025-08-22T01:10:27.076521Z" + } + }, + "cell_type": "code", + "source": "", + "id": "ad769b774bac8989", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[
涉及投资组合的产品或服务,应当按照产品或服务整体风险等级进行评估。 

 
]\n" + "
正确
\n" ] } ], - "execution_count": 22 + "execution_count": 40 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-22T01:06:54.877198Z", + "start_time": "2025-08-22T01:06:54.867849Z" + } + }, + "cell_type": "code", + "source": "", + "id": "e1474fd283674850", + "outputs": [], + "execution_count": 35 }, { "metadata": {}, @@ -271,7 +357,7 @@ "outputs": [], "execution_count": null, "source": "", - "id": "ad769b774bac8989" + "id": "54ad268f864e1f6c" } ], "metadata": { diff --git a/data.db.zip b/data.db.zip index 31fe72d..e2f5a29 100644 Binary files a/data.db.zip and b/data.db.zip differ