# education_spider_render_v2.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: privacy
  3. # @Date: 2022-08-03 18:07:19
  4. # @Last Modified by: privacy
  5. # @Last Modified time: 2022-09-26 16:05:39
  6. #
  7. # 每小时一百个网页
  8. # 添加使用新闻内容解析功能
  9. # 解析 url title author publish_time content images
  10. # js 渲染, 添加每个节点的位置信息,提升判断文章内容方法
  11. # 已完成
  12. import time
  13. import re
  14. import platform
  15. import yaml
  16. from pymongo import MongoClient
  17. client = MongoClient("192.168.1.200", 27017)
  18. # collection = client['education']['response']
  19. collection = client['education']['test']
  20. from logger import LoggerHandler
  21. logger = LoggerHandler(name="education")
  22. logger.set_file_handler(filename="education.log")
  23. from selenium import webdriver
  24. from selenium.webdriver.chrome.service import Service
  25. from selenium.common.exceptions import TimeoutException
  26. from selenium.common.exceptions import NoSuchElementException
  27. from selenium.common.exceptions import StaleElementReferenceException
  28. from selenium.common.exceptions import UnexpectedAlertPresentExpection
  29. from selenium.webdriver.support import expected_conditions as EC
  30. from selenium.webdriver.support.ui import WebDriverWait
  31. from selenium.webdriver.support.wait import WebDriverWait
  32. from selenium.webdriver.common.by import By
  33. from selenium.webdriver.common.keys import Keys
  34. from selenium.webdriver.common.action_chains import ActionChains
  35. from gne import GeneralNewsExtractor
  36. class AutoSpider(object):
  37. """使用 selenium 在 bing 搜索引擎提取通用 body 文本数据"""
  38. def __init__(self):
  39. super(AutoSpider, self).__init__()
  40. # 服务
  41. if (platform.system() == "Windows"):
  42. service = Service(executable_path="chromedriver.exe")
  43. else:
  44. service = Service(executable_path="chromedriver")
  45. # 选项
  46. options = webdriver.ChromeOptions()
  47. # 无界面运行
  48. options.add_argument('--headless')
  49. # 以最高权限运行
  50. options.add_argument('--no-sandbox')
  51. # 配置代理
  52. # options.add_argument('proxy-server={}'.format(self.proxy_server))
  53. # 直接配置大小和set_window_size一样
  54. options.add_argument('--window-size={},{}'.format(1920, 1080))
  55. # 语言
  56. options.add_argument('--lang=en')
  57. # 禁用扩展
  58. options.add_argument('--disable-extensions')
  59. # options.add_argument('--disable-infobars')
  60. # 忽略证书错误
  61. options.add_argument('--ignore-certificate-errors')
  62. # 禁止通知
  63. options.add_argument('--disable-notifications')
  64. options.add_argument('--force-device-scale-factor=1')
  65. options.add_argument('--disable-dev-shm-usage')
  66. # 禁用浏览器侧导航
  67. options.add_argument('--disable-browser-side-navigation')
  68. # 不加载图片, 提升速度
  69. options.add_argument('blink-settings=imagesEnabled=false')
  70. # info:0 warning:1 error:2 fail:3
  71. options.add_argument('log-level=2')
  72. options.add_argument("--disable-blink-features=AutomationControlled")
  73. # 设置开发者模式启动,该模式下webdriver属性为正常值
  74. options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
  75. options.add_experimental_option('useAutomationExtension', False)
  76. # 禁用浏览器弹窗
  77. prefs = {
  78. 'profile.default_content_setting_values': {
  79. 'notifications': 2
  80. }
  81. }
  82. options.add_experimental_option("prefs", prefs)
  83. self.driver = webdriver.Chrome(options=options, service=service)
  84. # 注入防检测脚本
  85. with open('./stealth.min.js') as fj:
  86. js = fj.read()
  87. self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
  88. # 设置网页超时
  89. self.driver.set_page_load_timeout(60)
  90. self.driver.set_script_timeout(60)
  91. self.extractor = GeneralNewsExtractor()
  92. self.extract = self.extractor.extract
  93. with open('render.js') as fr:
  94. self.render = fr.read()
  95. def search_from_bing(self, search_query):
  96. """使用必应英文搜索"""
  97. try:
  98. self.driver.get('https://cn.bing.com/')
  99. except:
  100. self.driver.execute_script('window.stop()')
  101. logger.info("page timeout!")
  102. WebDriverWait(self.driver, 180, 5).until(EC.visibility_of_element_located((By.ID,'sb_form_q')))
  103. self.driver.find_element(By.ID, "sb_form_q").send_keys(search_query + " (language:en)")
  104. time.sleep(3)
  105. self.driver.implicitly_wait(3)
  106. self.driver.find_element(By.ID, "sb_form_q").send_keys(Keys.ENTER)
  107. time.sleep(4)
  108. self.driver.implicitly_wait(3)
  109. WebDriverWait(self.driver, 180, 10).until(EC.visibility_of_element_located((By.ID,'est_en')))
  110. self.driver.find_element(By.ID, "est_en").click()
  111. time.sleep(3)
  112. self.driver.implicitly_wait(3)
  113. return True
  114. def search_from_google(self, search_query):
  115. try:
  116. self.driver.get("https://www.google.com")
  117. except:
  118. self.driver.execute_script('window.stop()')
  119. logger.info("page timeout")
  120. WebDriverWait(self.driver, 180, 5).until(EC.visibility_of_element_located((By.NAME, "q")))
  121. self.driver.find_element(By.NAME, "q").send_keys(search_query)
  122. time.sleep(3)
  123. self.driver.implicitly_wait(3)
  124. self.driver.find_element(By.NAME, "q").send_keys(Keys.ENTER)
  125. time.sleep(4)
  126. self.driver.implicitly_wait(3)
  127. return True
  128. def get_next_page_form_bing(self):
  129. """必应点击下一页"""
  130. try:
  131. self.driver.refresh()
  132. time.sleep(3)
  133. self.driver.implicitly_wait(3)
  134. # 有下一页按钮
  135. if self.driver.find_element(By.CSS_SELECTOR, "div.sw_next").text:
  136. self.driver.find_element(By.CSS_SELECTOR, "div.sw_next").click()
  137. # 没有下一页按钮
  138. elif self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").text:
  139. self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").click()
  140. # 页面加载错误
  141. else:
  142. return False
  143. # 返回成功
  144. return True
  145. except Exception as e:
  146. logger.error(e)
  147. return False
  148. def get_next_page_form_google(self):
  149. """谷歌点击下一页"""
  150. try:
  151. self.driver.refresh()
  152. time.sleep(3)
  153. self.driver.implicitly_wait(3)
  154. # 有下一页按钮
  155. if self.driver.find_element(By.ID, "pnnext").text:
  156. self.driver.find_element(By.ID, "pnnext").click()
  157. # 没有下一页按钮
  158. # elif self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").text:
  159. # self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").click()
  160. # 页面加载错误
  161. else:
  162. return False
  163. # 返回成功
  164. return True
  165. except Exception as e:
  166. logger.error(e)
  167. return False
  168. def click_title_from_bing(self, hotkey):
  169. """点击当前列表中的所有网页"""
  170. for epoch in range(1):
  171. # 遍历标题列表
  172. for index, item in enumerate(self.driver.find_elements(By.CSS_SELECTOR, "#b_results li.b_algo")):
  173. # 判断 url 是否已获取
  174. try:
  175. href = item.find_element(By.CSS_SELECTOR, "h2 a").get_attribute("href")
  176. # pdf 文件
  177. if href.endswith('pdf'):
  178. continue
  179. except StaleElementReferenceException as e:
  180. # DOM 树结构改变,尝试重新获取
  181. try:
  182. item = self.driver.find_element(By.XPATH, "//*[@id='b_results']//li[{}]".format(index+1))
  183. href = item.find_element(By.CSS_SELECTOR, "h2 a").get_attribute("href")
  184. except Exception as e:
  185. continue
  186. # 判重成功,跳过页面
  187. if collection.find_one({"url": href}):
  188. continue
  189. # 尝试打开网页
  190. try:
  191. time.sleep(10)
  192. # 句柄为 1,可进行点击操作
  193. if len(self.driver.window_handles) == 1:
  194. try:
  195. item.find_element(By.TAG_NAME, "h2").click()
  196. except Exception as e:
  197. try:
  198. element = item.find_element(By.TAG_NAME, "h2")
  199. self.driver.execute_script('arguments[0].click()', element)
  200. except Exception as e:
  201. logger.error(e)
  202. # 句柄不为 1,清理句柄
  203. else:
  204. # 遍历句柄
  205. for i in range(len(self.driver.window_handles) - 1):
  206. # 切换句柄
  207. self.driver.switch_to.window(self.driver.window_handles[1])
  208. # 关闭句柄
  209. self.driver.close()
  210. # 清理完毕,执行点击操作
  211. item.find_element(By.TAG_NAME, "h2").click()
  212. # 打开网页失败
  213. except Exception as e:
  214. logger.error(e)
  215. # 打开网页成功
  216. else:
  217. # 判断成功打开标签页
  218. if (len(self.driver.window_handles) == 2):
  219. # 切换标签页
  220. self.driver.switch_to.window(self.driver.window_handles[1])
  221. # 判断是否为文章
  222. if self.check_article():
  223. # 读文章
  224. self.parse_page_from_article(hotkey)
  225. time.sleep(10)
  226. finally:
  227. # 返回原始页
  228. self.driver.switch_to.window(self.driver.window_handles[0])
  229. self.driver.implicitly_wait(5)
  230. # 列表读取完毕,打开下一页
  231. # else:
  232. # # 打开下一页失败,直接退出
  233. # if not self.get_next_page_form_bing():
  234. # break
  235. # # 打开成功,全局等待加载
  236. # self.driver.implicitly_wait(5)
  237. # 切换到第一句柄
  238. self.driver.switch_to.window(self.driver.window_handles[0])
  239. def click_title_from_google(self, hotkey):
  240. """点击当前列表中的所有网页"""
  241. for epoch in range(10):
  242. # 遍历标题列表
  243. for index, item in enumerate(self.driver.find_elements(By.XPATH, '//*[@id="rso"]//div[@class="yuRUbf"]')):
  244. # 判断 url 是否已获取
  245. try:
  246. href = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
  247. # pdf 文件
  248. if href.endswith('pdf'):
  249. continue
  250. except StaleElementReferenceException:
  251. # DOM 树结构改变,尝试重新获取
  252. try:
  253. item = self.driver.find_elements(By.XPATH, '//*[@id="rso"]//div[@class="yuRUbf"]')[index]
  254. href = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
  255. except Exception as e:
  256. continue
  257. # 判重成功,跳过页面
  258. if collection.find_one({"url": href}):
  259. continue
  260. # 尝试打开网页
  261. try:
  262. time.sleep(10)
  263. # 句柄为 1,可进行点击操作
  264. if len(self.driver.window_handles) == 1:
  265. try:
  266. self.driver.execute_script('window.open("' + item.find_element(By.XPATH, "a").get_attribute("href") + '","_blank");')
  267. except Exception as e:
  268. logger.error(e)
  269. # try:
  270. # element = item.find_element(By.TAG_NAME, "h2")
  271. # self.driver.execute_script('arguments[0].click()', element)
  272. # except Exception as e:
  273. # logger.error(e)
  274. # 句柄不为 1,清理句柄
  275. else:
  276. # 遍历句柄
  277. for i in range(len(self.driver.window_handles) - 1):
  278. # 切换句柄
  279. self.driver.switch_to.window(self.driver.window_handles[1])
  280. # 关闭句柄
  281. self.driver.close()
  282. # 清理完毕,执行点击操作
  283. self.driver.execute_script('window.open("' + item.find_element(By.XPATH, "a").get_attribute("href") + '","_blank");')
  284. # 打开网页失败
  285. except Exception as e:
  286. logger.error(e)
  287. # 打开网页成功
  288. else:
  289. # 判断成功打开标签页
  290. if (len(self.driver.window_handles) == 2):
  291. # 切换标签页
  292. self.driver.switch_to.window(self.driver.window_handles[1])
  293. # 判断是否为文章
  294. if self.check_article():
  295. # 读文章
  296. self.parse_page_from_article(hotkey)
  297. time.sleep(10)
  298. finally:
  299. # 返回原始页
  300. self.driver.switch_to.window(self.driver.window_handles[0])
  301. self.driver.implicitly_wait(5)
  302. # 列表读取完毕,打开下一页
  303. else:
  304. # 打开下一页失败,直接退出
  305. if not self.get_next_page_form_google():
  306. break
  307. # 打开成功,全局等待加载
  308. self.driver.implicitly_wait(5)
  309. # 切换到第一句柄
  310. self.driver.switch_to.window(self.driver.window_handles[0])
  311. def parse_page_from_article(self, hotkey):
  312. """解析网页body"""
  313. # 滚动到底层
  314. try:
  315. height = self.driver.execute_script('return document.body.scrollHeight')
  316. if height > 1080:
  317. self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
  318. except TimeoutException as e:
  319. # 超时尝试停止加载
  320. try:
  321. self.driver.execute_script('window.stop ? window.stop() : document.execCommand("Stop");')
  322. except TimeoutException as e:
  323. logger.error('Timeout!')
  324. except Exception as e:
  325. logger.error(e)
  326. except UnexpectedAlertPresentExpection as e:
  327. logger.error(e)
  328. return
  329. except Exception as e:
  330. self.driver.close()
  331. logger.error(e)
  332. return
  333. page_source = ""
  334. url = ""
  335. for x in range(3):
  336. try:
  337. # 读取网页源码
  338. if not page_source:
  339. page_source = self.driver.page_source
  340. # 解析网页 url
  341. if not url:
  342. url = self.driver.current_url
  343. except TimeoutException as e:
  344. logger.info('Timeout!')
  345. except Exception as e:
  346. logger.error(e)
  347. if page_source:
  348. visible = False
  349. try:
  350. self.driver.execute_script(self.render)
  351. visible = True
  352. except Exception as e:
  353. logger.error(e)
  354. try:
  355. #if visible:
  356. # result = self.extract(page_source, use_visiable_info=True)
  357. #else:
  358. # result = self.extract(page_source)
  359. result = self.extract(page_source)
  360. except Exception as e:
  361. result = {"title":"", "author":"", "publish_time":"", "content":"", "images":[]}
  362. result['page_source'] = page_source
  363. else:
  364. self.driver.close()
  365. return
  366. if url:
  367. result['url'] = url
  368. else:
  369. self.driver.close()
  370. return
  371. if not result['title']:
  372. # 解析网页标题
  373. try:
  374. result['title'] = self.driver.title
  375. except Exception as e:
  376. logger.error(e)
  377. # 元数据
  378. metadata = dict([])
  379. for meta in self.driver.find_elements(By.TAG_NAME, "meta"):
  380. try:
  381. if meta.get_attribute("name"):
  382. metadata[meta.get_attribute("name")] = meta.get_attribute("content")
  383. except Exception as e:
  384. pass
  385. result['metadata'] = metadata
  386. if not result['content']:
  387. # 提取正文
  388. try:
  389. result['content'] = self.driver.find_element(By.XPATH, "//body").text
  390. except NoSuchElementException as e:
  391. self.driver.close()
  392. return
  393. except TimeoutException as e:
  394. self.driver.close()
  395. return
  396. except Exception as e:
  397. self.driver.close()
  398. logger.error(e)
  399. return
  400. result["crawl_time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
  401. if self.check_useful(result):
  402. result['used'] = 0
  403. else:
  404. result["used"] = 1
  405. result['hotkey'] = hotkey
  406. # 存储
  407. try:
  408. logger.info(result)
  409. # 再次查重
  410. if not collection.find_one({"url":result["url"]}):
  411. collection.insert_one(result)
  412. except Exception as e:
  413. logger.error(e)
  414. finally:
  415. self.driver.close()
  416. return
  417. def check_article(self):
  418. """判断网页是否为文章"""
  419. return True
  420. def check_useful(self, result):
  421. """判断网页是否可用"""
  422. if (not result['content']) or (not result['title']) or (len(result['content']) < 100) or re.search(r"[\u4e00-\u9fa5]", result['title']) or ("403" in result["title"]) or ("404" in result["title"]) or ("502" in result["title"]):
  423. return False
  424. else:
  425. return True
  426. def start_crawl(self, query, engine):
  427. """启动爬虫"""
  428. if (engine == 'bing') and self.search_from_bing(query):
  429. self.click_title_from_bing(query)
  430. elif (engine == 'google') self.search_from_google(query):
  431. self.click_title_from_google(query)
  432. def close(self):
  433. """关闭爬虫"""
  434. self.driver.close()
  435. def main():
  436. # 关键词列表
  437. # with open('querys.txt', 'r', encoding='utf-8') as fp:
  438. # querys = fp.read()
  439. with open("querys.yaml", "r", encoding="utf-8") as fp:
  440. querys = yaml.safe_load(fp.read())
  441. with open("temp","r",encoding="utf-8") as ft:
  442. tk = ft.read()
  443. if "Adult education" in tk:
  444. tk = None
  445. if not tk:
  446. start = True
  447. else:
  448. start = False
  449. # 启动爬虫
  450. robot = AutoSpider()
  451. # 遍历关键词
  452. for key in querys.keys():
  453. for cont in querys[key][0].keys():
  454. for query in querys[key][0][cont]:
  455. if start:
  456. robot.start_crawl(cont+" "+query, "bing")
  457. with open("temp","w",encoding="utf-8") as ft:
  458. ft.write(key+cont+query)
  459. elif (key+cont+query == tk):
  460. start = True
  461. robot.start_crawl(cont+" "+query, "bing")
  462. with open("temp","w",encoding="utf-8") as ft:
  463. ft.write(key+cont+query)
  464. else:
  465. continue
  466. # 关闭爬虫
  467. robot.close()
  468. if __name__ == '__main__':
  469. main()