education_spider_render.py

# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-08-03 18:07:19
# @Last Modified by: privacy
# @Last Modified time: 2022-10-31 12:59:17
#
# Roughly one hundred pages per hour
# Added news-content parsing
# Parses url, title, author, publish_time, content, and images
# JS rendering: annotate each node with its position to improve article-body detection
# Done
import time
import re

import yaml
from pymongo import MongoClient

client = MongoClient("192.168.1.200", 27017)
# collection = client['education']['response']
collection = client['education']['hallowmas']
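# A unique index on "url" (an assumption; the original script does not create
# one) would make the find_one / insert_one dedup used below race-free:
# collection.create_index("url", unique=True)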

from logger import LoggerHandler

logger = LoggerHandler(name="education")
logger.set_file_handler(filename="education.log")

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import UnexpectedAlertPresentException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from gne import GeneralNewsExtractor


class AutoSpider(object):
    """Use selenium to extract generic body text from Bing search results."""

    def __init__(self):
        super(AutoSpider, self).__init__()
        # Driver service
        service = Service(executable_path="chromedriver")
        # Browser options
        options = webdriver.ChromeOptions()
        # Run headless
        options.add_argument('--headless')
        # Run with the highest privileges
        options.add_argument('--no-sandbox')
        # Proxy configuration
        # options.add_argument('proxy-server={}'.format(self.proxy_server))
        # Setting the size here has the same effect as set_window_size
        options.add_argument('--window-size={},{}'.format(1920, 1080))
        # Language
        options.add_argument('--lang=en')
        # Disable extensions
        options.add_argument('--disable-extensions')
        # options.add_argument('--disable-infobars')
        # Ignore certificate errors
        options.add_argument('--ignore-certificate-errors')
        # Disable notifications
        options.add_argument('--disable-notifications')
        options.add_argument('--force-device-scale-factor=1')
        options.add_argument('--disable-dev-shm-usage')
        # Disable browser-side navigation
        options.add_argument('--disable-browser-side-navigation')
        # Skip image loading to speed things up
        options.add_argument('blink-settings=imagesEnabled=false')
        # info:0 warning:1 error:2 fatal:3
        options.add_argument('log-level=2')
        options.add_argument("--disable-blink-features=AutomationControlled")
        # Start in developer mode so the webdriver property reads as a normal value
        options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
        options.add_experimental_option('useAutomationExtension', False)
        # Suppress browser pop-ups
        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2
            }
        }
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(options=options, service=service)
        # Inject the anti-detection script
        with open('./stealth.min.js') as fj:
            js = fj.read()
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
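        # If stealth.min.js is missing, a minimal fallback (an assumption, not
        # part of the original setup) is to hide only navigator.webdriver via
        # the same CDP call:
        # self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        #     "source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        # })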
        # Page-load and script timeouts
        self.driver.set_page_load_timeout(60)
        self.driver.set_script_timeout(60)
        self.extractor = GeneralNewsExtractor()
        self.extract = self.extractor.extract
        with open('render.js') as fr:
            self.render = fr.read()
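        # render.js is not included here; per the header comments it is expected
        # to annotate each DOM node with its on-screen position so that
        # visibility-aware extraction (gne's use_visiable_info) could be enabled.
        # A minimal sketch of such a script (an assumption, not the real file):
        # document.querySelectorAll('*').forEach(el => {
        #     const r = el.getBoundingClientRect();
        #     el.setAttribute('data-left', r.left);
        #     el.setAttribute('data-top', r.top);
        # });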

    def search_from_bing(self, search_query):
        """Search Bing in English."""
        try:
            self.driver.get('https://cn.bing.com/')
        except Exception:
            self.driver.execute_script('window.stop()')
            logger.info("page timeout!")
        # Resume from a previously saved result-list url, if any
        with open("url.txt", "r", encoding="utf-8") as fu:
            url = fu.read()
        if url:
            try:
                self.driver.get(url)
                self.driver.implicitly_wait(3)
            except Exception:
                self.driver.execute_script('window.stop()')
                logger.info("page timeout!")
            WebDriverWait(self.driver, 180, 10).until(EC.visibility_of_element_located((By.ID, 'est_en')))
            self.driver.find_element(By.ID, "est_en").click()
            time.sleep(3)
            self.driver.implicitly_wait(3)
            return True
        WebDriverWait(self.driver, 180, 5).until(EC.visibility_of_element_located((By.ID, 'sb_form_q')))
        self.driver.find_element(By.ID, "sb_form_q").send_keys(search_query + " (language:en)")
        time.sleep(3)
        self.driver.implicitly_wait(3)
        self.driver.find_element(By.ID, "sb_form_q").send_keys(Keys.ENTER)
        time.sleep(4)
        self.driver.implicitly_wait(3)
        # Switch to English-only results
        WebDriverWait(self.driver, 180, 10).until(EC.visibility_of_element_located((By.ID, 'est_en')))
        self.driver.find_element(By.ID, "est_en").click()
        time.sleep(3)
        # Open the time filter and pick the third entry under "Any time"
        WebDriverWait(self.driver, 180, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".fs_label")))
        self.driver.find_element(By.CSS_SELECTOR, ".fs_label").click()
        WebDriverWait(self.driver, 180, 10).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#ftrD_Any_time > a:nth-child(3)")))
        self.driver.find_element(By.CSS_SELECTOR, "#ftrD_Any_time > a:nth-child(3)").click()
        self.driver.implicitly_wait(3)
        return True

    def search_from_google(self, search_query):
        """Search on Google."""
        try:
            self.driver.get("https://www.google.com")
        except Exception:
            self.driver.execute_script('window.stop()')
            logger.info("page timeout")
        WebDriverWait(self.driver, 180, 5).until(EC.visibility_of_element_located((By.NAME, "q")))
        self.driver.find_element(By.NAME, "q").send_keys(search_query)
        time.sleep(3)
        self.driver.implicitly_wait(3)
        self.driver.find_element(By.NAME, "q").send_keys(Keys.ENTER)
        time.sleep(4)
        self.driver.implicitly_wait(3)
        return True

    def get_next_page_form_bing(self):
        """Click Bing's next-page button."""
        try:
            self.driver.refresh()
            time.sleep(3)
            self.driver.implicitly_wait(3)
            # Next-page button exists
            if self.driver.find_element(By.CSS_SELECTOR, "div.sw_next").text:
                self.driver.find_element(By.CSS_SELECTOR, "div.sw_next").click()
            # No next-page button: click the pager entry after the current page
            elif self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").text:
                self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").click()
            # Page failed to load
            else:
                return False
            # Success: checkpoint the current result-list url
            with open("url.txt", "w", encoding="utf-8") as fu:
                fu.write(self.driver.current_url)
            return True
        except Exception as e:
            logger.error(e)
            return False

    def get_next_page_form_google(self):
        """Click Google's next-page button."""
        try:
            self.driver.refresh()
            time.sleep(3)
            self.driver.implicitly_wait(3)
            # Next-page button exists
            if self.driver.find_element(By.ID, "pnnext").text:
                self.driver.find_element(By.ID, "pnnext").click()
            # No next-page button (unused Bing fallback kept for reference)
            # elif self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").text:
            #     self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").click()
            # Page failed to load
            else:
                return False
            # Success
            return True
        except Exception as e:
            logger.error(e)
            return False

    def click_title_from_bing(self, hotkey):
        """Open every result in the current list."""
        for epoch in range(50):
            # Walk the title list
            for index, item in enumerate(self.driver.find_elements(By.CSS_SELECTOR, "#b_results li.b_algo")):
                # Check whether this url was fetched before
                try:
                    href = item.find_element(By.CSS_SELECTOR, "h2 a").get_attribute("href")
                    # Skip pdf files
                    if href.endswith('pdf'):
                        continue
                except StaleElementReferenceException:
                    # The DOM changed under us; try to locate the item again
                    try:
                        item = self.driver.find_element(By.XPATH, "//*[@id='b_results']//li[{}]".format(index + 1))
                        href = item.find_element(By.CSS_SELECTOR, "h2 a").get_attribute("href")
                    except Exception:
                        continue
                # Already stored: skip this page
                if collection.find_one({"url": href}):
                    continue
                # Try to open the page
                try:
                    time.sleep(10)
                    # Exactly one window handle: safe to click
                    if len(self.driver.window_handles) == 1:
                        try:
                            item.find_element(By.TAG_NAME, "h2").click()
                        except Exception:
                            try:
                                element = item.find_element(By.TAG_NAME, "h2")
                                self.driver.execute_script('arguments[0].click()', element)
                            except Exception as e:
                                logger.error(e)
                    # More than one handle: close the extras first
                    else:
                        for i in range(len(self.driver.window_handles) - 1):
                            # Switch to the extra handle and close it
                            self.driver.switch_to.window(self.driver.window_handles[1])
                            self.driver.close()
                        # Clean; now click
                        item.find_element(By.TAG_NAME, "h2").click()
                # Opening failed
                except Exception as e:
                    logger.error(e)
                # Opening succeeded
                else:
                    # A new tab really opened
                    if len(self.driver.window_handles) == 2:
                        # Switch to the new tab
                        self.driver.switch_to.window(self.driver.window_handles[1])
                        # Only parse pages that look like articles
                        if self.check_article():
                            self.parse_page_from_article(hotkey)
                        time.sleep(10)
                finally:
                    # Return to the result page
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.implicitly_wait(5)
            # List exhausted: open the next page
            else:
                # Next page failed: stop
                if not self.get_next_page_form_bing():
                    break
                # Next page opened: wait for it to load
                self.driver.implicitly_wait(5)
            # Make sure we are back on the first handle
            self.driver.switch_to.window(self.driver.window_handles[0])

    def click_title_from_google(self, hotkey):
        """Open every result in the current list."""
        for epoch in range(10):
            # Walk the title list
            for index, item in enumerate(self.driver.find_elements(By.XPATH, '//*[@id="rso"]//div[@class="yuRUbf"]')):
                # Check whether this url was fetched before
                try:
                    href = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
                    # Skip pdf files
                    if href.endswith('pdf'):
                        continue
                except StaleElementReferenceException:
                    # The DOM changed under us; try to locate the item again
                    try:
                        item = self.driver.find_elements(By.XPATH, '//*[@id="rso"]//div[@class="yuRUbf"]')[index]
                        href = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
                    except Exception:
                        continue
                # Already stored: skip this page
                if collection.find_one({"url": href}):
                    continue
                # Try to open the page
                try:
                    time.sleep(10)
                    # Exactly one window handle: safe to open a new tab
                    if len(self.driver.window_handles) == 1:
                        try:
                            self.driver.execute_script('window.open("' + item.find_element(By.XPATH, "a").get_attribute("href") + '","_blank");')
                        except Exception as e:
                            logger.error(e)
                        # try:
                        #     element = item.find_element(By.TAG_NAME, "h2")
                        #     self.driver.execute_script('arguments[0].click()', element)
                        # except Exception as e:
                        #     logger.error(e)
                    # More than one handle: close the extras first
                    else:
                        for i in range(len(self.driver.window_handles) - 1):
                            # Switch to the extra handle and close it
                            self.driver.switch_to.window(self.driver.window_handles[1])
                            self.driver.close()
                        # Clean; now open the page
                        self.driver.execute_script('window.open("' + item.find_element(By.XPATH, "a").get_attribute("href") + '","_blank");')
                # Opening failed
                except Exception as e:
                    logger.error(e)
                # Opening succeeded
                else:
                    # A new tab really opened
                    if len(self.driver.window_handles) == 2:
                        # Switch to the new tab
                        self.driver.switch_to.window(self.driver.window_handles[1])
                        # Only parse pages that look like articles
                        if self.check_article():
                            self.parse_page_from_article(hotkey)
                        time.sleep(10)
                finally:
                    # Return to the result page
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.implicitly_wait(5)
            # List exhausted: open the next page
            else:
                # Next page failed: stop
                if not self.get_next_page_form_google():
                    break
                # Next page opened: wait for it to load
                self.driver.implicitly_wait(5)
            # Make sure we are back on the first handle
            self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_page_from_article(self, hotkey):
        """Parse the page body."""
        # Scroll to the bottom
        try:
            height = self.driver.execute_script('return document.body.scrollHeight')
            if height > 1080:
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        except TimeoutException:
            # On timeout, try to stop loading
            try:
                self.driver.execute_script('window.stop ? window.stop() : document.execCommand("Stop");')
            except TimeoutException:
                logger.error('Timeout!')
            except Exception as e:
                logger.error(e)
        except UnexpectedAlertPresentException as e:
            logger.error(e)
            return
        except Exception as e:
            self.driver.close()
            logger.error(e)
            return
        page_source = ""
        url = ""
        # Up to three attempts to read source and url
        for x in range(3):
            try:
                # Page source
                if not page_source:
                    page_source = self.driver.page_source
                # Page url
                if not url:
                    url = self.driver.current_url
            except TimeoutException:
                logger.info('Timeout!')
            except Exception as e:
                logger.error(e)
        if page_source:
            visible = False
            try:
                self.driver.execute_script(self.render)
                visible = True
            except Exception as e:
                logger.error(e)
            try:
                # if visible:
                #     result = self.extract(page_source, use_visiable_info=True)
                # else:
                #     result = self.extract(page_source)
                result = self.extract(page_source)
            except Exception:
                result = {"title": "", "author": "", "publish_time": "", "content": "", "images": []}
            result['page_source'] = page_source
        else:
            self.driver.close()
            return
        if url:
            result['url'] = url
        else:
            self.driver.close()
            return
        if not result['title']:
            # Fall back to the browser title
            try:
                result['title'] = self.driver.title
            except Exception as e:
                logger.error(e)
        # Metadata from <meta> tags
        metadata = dict()
        for meta in self.driver.find_elements(By.TAG_NAME, "meta"):
            try:
                if meta.get_attribute("name"):
                    metadata[meta.get_attribute("name")] = meta.get_attribute("content")
            except Exception:
                pass
        result['metadata'] = metadata
        if not result['content']:
            # Fall back to the visible body text
            try:
                result['content'] = self.driver.find_element(By.XPATH, "//body").text
            except NoSuchElementException:
                self.driver.close()
                return
            except TimeoutException:
                self.driver.close()
                return
            except Exception as e:
                self.driver.close()
                logger.error(e)
                return
        result["crawl_time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        if self.check_useful(result):
            result['used'] = 0
        else:
            result["used"] = 1
        result['hotkey'] = hotkey
        # Store
        try:
            logger.info(result)
            # Dedup once more before insert
            if not collection.find_one({"url": result["url"]}):
                collection.insert_one(result)
        except Exception as e:
            logger.error(e)
        finally:
            self.driver.close()
        return

    def check_article(self):
        """Check whether the page is an article (currently always True)."""
        return True
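
    # check_article is a stub that accepts everything. A possible heuristic
    # (an assumption, not the original logic) would look for an <article> tag
    # or a minimum amount of visible text:
    #     def check_article(self):
    #         if self.driver.find_elements(By.TAG_NAME, "article"):
    #             return True
    #         return len(self.driver.find_element(By.XPATH, "//body").text) > 200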

    def check_useful(self, result):
        """Check whether the page content is usable."""
        if ((not result['content'])
                or (not result['title'])
                or (len(result['content']) < 200)
                or re.search(r"[\u4e00-\u9fa5]", result['title'])   # Chinese characters in the title
                or ("403" in result["title"])
                or ("404" in result["title"])
                or ("502" in result["title"])
                or ("Just a moment..." in result["title"])):        # Cloudflare challenge page
            return False
        else:
            return True

    def start_crawl(self, query, engine):
        """Run the spider for one query."""
        if (engine == 'bing') and self.search_from_bing(query):
            self.click_title_from_bing(query)
        elif (engine == 'google') and self.search_from_google(query):
            self.click_title_from_google(query)

    def close(self):
        """Shut the spider down."""
        # quit() ends the whole session (all windows plus the chromedriver
        # process); close() would only close the current window
        self.driver.quit()
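

# Hypothetical one-off usage (assumes chromedriver, stealth.min.js, render.js,
# url.txt, and temp exist in the working directory):
# robot = AutoSpider()
# robot.start_crawl("Adult education lifelong learning", "bing")
# robot.close()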


def main(engine="bing"):
    # Keyword list
    # with open('querys.txt', 'r', encoding='utf-8') as fp:
    #     querys = fp.read()
    with open("querys.yaml", "r", encoding="utf-8") as fp:
        querys = yaml.safe_load(fp.read())
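    # The loop below implies querys.yaml maps each top-level key to a list
    # whose first element is a {category: [query, ...]} dict. A hypothetical
    # sample (the query strings are illustrative only):
    #   Education:
    #     - Adult education:
    #         - lifelong learning policy
    #         - community college funding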
    # Resume checkpoint: "temp" holds the last key+cont+query that finished
    with open("temp", "r", encoding="utf-8") as ft:
        tk = ft.read()
    # Treat a checkpoint containing "Adult education" as a fresh start
    if "Adult education" in tk:
        tk = None
    if not tk:
        start = True
    else:
        start = False
    # Start the spider
    robot = AutoSpider()
    # Walk the keywords
    for key in querys.keys():
        for cont in querys[key][0].keys():
            for query in querys[key][0][cont]:
                if start:
                    robot.start_crawl(cont + " " + query, engine)
                    with open("temp", "w", encoding="utf-8") as ft:
                        ft.write(key + cont + query)
                elif key + cont + query == tk:
                    # Reached the checkpoint: resume crawling from here
                    start = True
                    robot.start_crawl(cont + " " + query, engine)
                    with open("temp", "w", encoding="utf-8") as ft:
                        ft.write(key + cont + query)
                else:
                    continue
    # Shut the spider down
    robot.close()


if __name__ == '__main__':
    main('bing')