# education_spider_render_v3.py
# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-08-03 18:07:19
# @Last Modified by: privacy
# @Last Modified time: 2022-11-10 14:33:05
#
# Roughly one hundred pages per hour.
# Added news-content parsing.
# Parses url, title, author, publish_time, content, and images.
# JS rendering: attach position info to every node to improve
# article-content detection.
# Done.
import time
import re

import yaml
from pymongo import MongoClient

client = MongoClient("192.168.1.200", 27017)
# collection = client['education']['response']
# collection = client['education']['hallowmas']
collection = client['education']['news']

from logger import LoggerHandler

logger = LoggerHandler(name="education")
logger.set_file_handler(filename="education.log")

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import UnexpectedAlertPresentException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from gne import GeneralNewsExtractor
import joblib
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
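
# The NLP helpers below assume the NLTK corpora are already available locally.
# A minimal one-time setup sketch (run once, needs network access):
#
#     import nltk
#     nltk.download('punkt')      # sentence/word tokenizers
#     nltk.download('stopwords')  # English stopword list
#     nltk.download('wordnet')    # dictionary backing WordNetLemmatizer
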
class AutoSpider(object):
    """Use selenium to extract generic body text from Bing search results."""

    def __init__(self):
        super(AutoSpider, self).__init__()
        # Driver service
        service = Service(executable_path=ChromeDriverManager().install())
        # Chrome options
        options = webdriver.ChromeOptions()
        # Run without a UI
        options.add_argument('--headless')
        # Run with the highest privileges
        options.add_argument('--no-sandbox')
        # Proxy configuration
        # options.add_argument('proxy-server={}'.format(self.proxy_server))
        # Setting the size here has the same effect as set_window_size
        options.add_argument('--window-size={},{}'.format(1920, 1080))
        # Language
        options.add_argument('--lang=en')
        # Disable extensions
        options.add_argument('--disable-extensions')
        # options.add_argument('--disable-infobars')
        # Ignore certificate errors
        options.add_argument('--ignore-certificate-errors')
        # Block notifications
        options.add_argument('--disable-notifications')
        options.add_argument('--force-device-scale-factor=1')
        options.add_argument('--disable-dev-shm-usage')
        # Disable browser-side navigation
        options.add_argument('--disable-browser-side-navigation')
        # Skip image loading for speed
        options.add_argument('blink-settings=imagesEnabled=false')
        # info:0 warning:1 error:2 fatal:3
        options.add_argument('log-level=2')
        options.add_argument("--disable-blink-features=AutomationControlled")
        # Start in developer mode so the webdriver property reports a normal value
        options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
        options.add_experimental_option('useAutomationExtension', False)
        # Disable browser popups
        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2
            }
        }
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(options=options, service=service)
        # Inject the anti-detection script (stealth.min.js must sit next to this file)
        with open('./stealth.min.js') as fj:
            js = fj.read()
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
        # Page-load timeouts
        self.driver.set_page_load_timeout(60)
        self.driver.set_script_timeout(60)
        self.extractor = GeneralNewsExtractor()
        self.extract = self.extractor.extract
        # render.js annotates DOM nodes with layout info for content detection
        with open('render.js') as fr:
            self.render = fr.read()
        self.model = joblib.load("NLP/SVC.joblib")
        self.tfidf = joblib.load("NLP/TFIDF.joblib")
    def search_from_bing(self, search_query):
        """Search on Bing, restricted to English results."""
        try:
            self.driver.get('https://cn.bing.com/')
        except Exception:
            self.driver.execute_script('window.stop()')
            logger.info("page timeout!")
        # Resume from the results URL saved by the previous run, if any
        with open("url.txt", "r", encoding="utf-8") as fu:
            url = fu.read()
        if url:
            try:
                self.driver.get(url)
                self.driver.implicitly_wait(3)
            except Exception:
                self.driver.execute_script('window.stop()')
                logger.info("page timeout!")
            WebDriverWait(self.driver, 180, 10).until(EC.element_to_be_clickable((By.ID, 'est_en')))
            self.driver.find_element(By.ID, "est_en").click()
            time.sleep(3)
            self.driver.implicitly_wait(3)
            return True
        WebDriverWait(self.driver, 180, 5).until(EC.visibility_of_element_located((By.ID, 'sb_form_q')))
        self.driver.find_element(By.ID, "sb_form_q").send_keys(search_query + " AND (language:en)")
        time.sleep(3)
        self.driver.implicitly_wait(3)
        self.driver.find_element(By.ID, "sb_form_q").send_keys(Keys.ENTER)
        time.sleep(4)
        self.driver.implicitly_wait(3)
        # Switch to English-only results
        WebDriverWait(self.driver, 180, 10).until(EC.element_to_be_clickable((By.ID, 'est_en')))
        self.driver.find_element(By.ID, "est_en").click()
        time.sleep(3)
        # Open the time filter and pick the third range option
        WebDriverWait(self.driver, 180, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "div#b_tween a.ftrH")))
        self.driver.find_element(By.CSS_SELECTOR, "div#b_tween a.ftrH").click()
        WebDriverWait(self.driver, 180, 10).until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ftrD_Any_time"]/a[3]/i')))
        self.driver.find_element(By.XPATH, '//*[@id="ftrD_Any_time"]/a[3]/i').click()
        self.driver.implicitly_wait(3)
        return True
    def search_from_google(self, search_query):
        """Search on Google."""
        try:
            self.driver.get("https://www.google.com")
        except Exception:
            self.driver.execute_script('window.stop()')
            logger.info("page timeout")
        WebDriverWait(self.driver, 180, 5).until(EC.visibility_of_element_located((By.NAME, "q")))
        self.driver.find_element(By.NAME, "q").send_keys(search_query)
        time.sleep(3)
        self.driver.implicitly_wait(3)
        self.driver.find_element(By.NAME, "q").send_keys(Keys.ENTER)
        time.sleep(4)
        self.driver.implicitly_wait(3)
        return True
    def get_next_page_from_bing(self):
        """Click through to the next Bing results page."""
        try:
            self.driver.refresh()
            time.sleep(3)
            self.driver.implicitly_wait(3)
            # A "next page" button exists
            if self.driver.find_element(By.CSS_SELECTOR, "div.sw_next").text:
                self.driver.find_element(By.CSS_SELECTOR, "div.sw_next").click()
            # No "next page" button: click the entry after the current page marker
            elif self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").text:
                self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").click()
            # Page failed to load
            else:
                return False
            try:
                time.sleep(5)
                nowpage = self.driver.current_url
                # Stop once the result offset passes 200 (roughly 20 pages)
                if int(re.findall(r"&first=(\d+)&", nowpage)[0]) > 200:
                    return False
            except Exception:
                pass
            logger.info(self.driver.current_url)
            # Persist the current results URL so a restart can resume here
            with open("url.txt", "w", encoding="utf-8") as fu:
                fu.write(self.driver.current_url)
            return True
        except NoSuchElementException:
            logger.info("Reached the last page!")
            return False
        except Exception as e:
            logger.error(e)
            return False
    def get_next_page_from_google(self):
        """Click through to the next Google results page."""
        try:
            self.driver.refresh()
            time.sleep(3)
            self.driver.implicitly_wait(3)
            # A "next page" button exists
            if self.driver.find_element(By.ID, "pnnext").text:
                self.driver.find_element(By.ID, "pnnext").click()
            # Page failed to load
            else:
                return False
            return True
        except Exception as e:
            logger.error(e)
            return False
    def click_title_from_bing(self, hotkey):
        """Open every result on the current Bing page, then page onward."""
        for epoch in range(10):
            # Walk the result list
            for index, item in enumerate(self.driver.find_elements(By.CSS_SELECTOR, "#b_results li.b_algo")):
                # Check whether this URL was already crawled
                publish_time = ""
                try:
                    href = item.find_element(By.CSS_SELECTOR, "h2 a").get_attribute("href")
                    try:
                        publish_time = item.find_element(By.CSS_SELECTOR, "span.news_dt").text
                    except Exception:
                        pass
                    # Skip pdf/doc files
                    if href.endswith('pdf') or href.endswith('doc'):
                        continue
                except StaleElementReferenceException:
                    # The DOM changed; try to re-locate the item
                    try:
                        item = self.driver.find_element(By.XPATH, "//*[@id='b_results']//li[{}]".format(index + 1))
                        href = item.find_element(By.CSS_SELECTOR, "h2 a").get_attribute("href")
                        try:
                            publish_time = item.find_element(By.CSS_SELECTOR, "span.news_dt").text
                        except Exception:
                            pass
                    except Exception:
                        continue
                # Duplicate URL: skip the page
                if collection.find_one({"url": href}):
                    print("duplicate record")
                    continue
                # Try to open the page
                try:
                    time.sleep(10)
                    # Exactly one handle: safe to click
                    if len(self.driver.window_handles) == 1:
                        try:
                            item.find_element(By.TAG_NAME, "h2").click()
                        except Exception:
                            try:
                                element = item.find_element(By.TAG_NAME, "h2")
                                self.driver.execute_script('arguments[0].click()', element)
                            except Exception as e:
                                logger.error(e)
                    # More than one handle: close the extras first
                    else:
                        for i in range(len(self.driver.window_handles) - 1):
                            # Always close the second handle; the list reshuffles
                            self.driver.switch_to.window(self.driver.window_handles[1])
                            self.driver.close()
                        # Refocus the results tab before clicking (the current
                        # window was just closed)
                        self.driver.switch_to.window(self.driver.window_handles[0])
                        item.find_element(By.TAG_NAME, "h2").click()
                # Failed to open the page
                except Exception as e:
                    logger.error(e)
                # Page opened
                else:
                    # A new tab appeared
                    if len(self.driver.window_handles) == 2:
                        # Switch to it
                        self.driver.switch_to.window(self.driver.window_handles[1])
                        # Is it an article?
                        if self.check_article():
                            # Parse the article
                            self.parse_page_from_article(hotkey, publish_time)
                            time.sleep(10)
                finally:
                    # Back to the results page
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.implicitly_wait(5)
            # List exhausted: open the next page
            else:
                # Next page failed: bail out
                if not self.get_next_page_from_bing():
                    break
                # Next page opened: wait for it to load
                self.driver.implicitly_wait(5)
        # Back to the first handle
        self.driver.switch_to.window(self.driver.window_handles[0])
    def click_title_from_google(self, hotkey):
        """Open every result on the current Google page, then page onward."""
        for epoch in range(10):
            # Walk the result list
            for index, item in enumerate(self.driver.find_elements(By.XPATH, '//*[@id="rso"]//div[@class="yuRUbf"]')):
                # Check whether this URL was already crawled
                try:
                    href = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
                    # Skip pdf files
                    if href.endswith('pdf'):
                        continue
                except StaleElementReferenceException:
                    # The DOM changed; try to re-locate the item
                    try:
                        item = self.driver.find_elements(By.XPATH, '//*[@id="rso"]//div[@class="yuRUbf"]')[index]
                        href = item.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
                    except Exception:
                        continue
                # Duplicate URL: skip the page
                if collection.find_one({"url": href}):
                    continue
                # Try to open the page
                try:
                    time.sleep(10)
                    # Exactly one handle: safe to open a new tab
                    if len(self.driver.window_handles) == 1:
                        try:
                            self.driver.execute_script('window.open("' + href + '","_blank");')
                        except Exception as e:
                            logger.error(e)
                    # More than one handle: close the extras first
                    else:
                        for i in range(len(self.driver.window_handles) - 1):
                            # Always close the second handle; the list reshuffles
                            self.driver.switch_to.window(self.driver.window_handles[1])
                            self.driver.close()
                        # Refocus the results tab, then open the new tab
                        self.driver.switch_to.window(self.driver.window_handles[0])
                        self.driver.execute_script('window.open("' + href + '","_blank");')
                # Failed to open the page
                except Exception as e:
                    logger.error(e)
                # Page opened
                else:
                    # A new tab appeared
                    if len(self.driver.window_handles) == 2:
                        # Switch to it
                        self.driver.switch_to.window(self.driver.window_handles[1])
                        # Is it an article?
                        if self.check_article():
                            # Parse the article (Google results carry no date)
                            self.parse_page_from_article(hotkey)
                            time.sleep(10)
                finally:
                    # Back to the results page
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.implicitly_wait(5)
            # List exhausted: open the next page
            else:
                # Next page failed: bail out
                if not self.get_next_page_from_google():
                    break
                # Next page opened: wait for it to load
                self.driver.implicitly_wait(5)
        # Back to the first handle
        self.driver.switch_to.window(self.driver.window_handles[0])
    def parse_page_from_article(self, hotkey, pbt=""):
        """Parse the page body and store the result."""
        # Scroll to the bottom
        try:
            height = self.driver.execute_script('return document.body.scrollHeight')
            if height > 1080:
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        except TimeoutException:
            # On timeout, try to stop loading
            try:
                self.driver.execute_script('window.stop ? window.stop() : document.execCommand("Stop");')
            except TimeoutException:
                logger.error('Timeout!')
            except Exception as e:
                logger.error(e)
        except UnexpectedAlertPresentException as e:
            logger.error(e)
            return
        except Exception as e:
            self.driver.close()
            logger.error(e)
            return
        page_source = ""
        url = ""
        # Up to three attempts
        for x in range(3):
            try:
                # Grab the page source
                if not page_source:
                    page_source = self.driver.page_source
                # Grab the page URL
                if not url:
                    url = self.driver.current_url
            except TimeoutException:
                logger.info('Timeout!')
            except Exception as e:
                logger.error(e)
        if page_source:
            visible = False
            try:
                # Annotate nodes with layout info to aid content detection
                self.driver.execute_script(self.render)
                visible = True
            except Exception as e:
                logger.error(e)
            try:
                # if visible:
                #     result = self.extract(page_source, use_visiable_info=True)
                # else:
                #     result = self.extract(page_source)
                result = self.extract(page_source)
            except Exception as e:
                result = {"title": "", "author": "", "publish_time": "", "content": "", "images": []}
            result['page_source'] = page_source
        else:
            self.driver.close()
            return
        if url:
            result['url'] = url
        else:
            self.driver.close()
            return
        if not result['title']:
            # Fall back to the browser title
            try:
                result['title'] = self.driver.title
            except Exception as e:
                logger.error(e)
        # Collect <meta> tags
        metadata = dict([])
        for meta in self.driver.find_elements(By.TAG_NAME, "meta"):
            try:
                if meta.get_attribute("name"):
                    metadata[meta.get_attribute("name")] = meta.get_attribute("content")
            except Exception:
                pass
        result['metadata'] = metadata
        if not result['content']:
            # Fall back to the full body text
            try:
                result['content'] = self.driver.find_element(By.XPATH, "//body").text
            except NoSuchElementException:
                self.driver.close()
                return
            except TimeoutException:
                self.driver.close()
                return
            except Exception as e:
                self.driver.close()
                logger.error(e)
                return
        result["crawl_time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        if self.check_useful(result):
            result['used'] = 0
        else:
            result["used"] = 1
        result['hotkey'] = hotkey
        if pbt:
            print(pbt)
            # Normalize dates like "Nov 10, 2022" to "2022-11-10"
            timedict = {"Dec": "12", "Nov": "11", "Oct": "10", "Sep": "9", "Aug": "8", "Jul": "7",
                        "Jun": "6", "May": "5", "Apr": "4", "Mar": "3", "Feb": "2", "Jan": "1"}
            rt = re.findall(r"(\w+)\s?(\d+)\S?\s?(\d+)", pbt)
            if rt:
                print(rt)
                pbt = str(rt[0][2]) + '-' + timedict[rt[0][0]] + '-' + str(rt[0][1])
            result['publish_time'] = pbt
        # Store
        try:
            # logger.info(result)
            # Check for duplicates once more
            if not collection.find_one({"url": result["url"]}):
                collection.insert_one(result)
        except Exception as e:
            logger.error(e)
        finally:
            self.driver.close()
        return
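
    # For reference, a document stored by the method above carries roughly
    # these fields (gne supplies title/author/publish_time/content/images):
    #
    #     {"url": ..., "title": ..., "author": ..., "publish_time": ...,
    #      "content": ..., "images": [...], "page_source": ...,
    #      "metadata": {...}, "crawl_time": "YYYY-mm-dd HH:MM:SS",
    #      "used": 0 | 1, "hotkey": ...}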
    def check_article(self):
        """Decide whether the page is an article (placeholder: accepts everything)."""
        return True

    def check_useful(self, result):
        """Decide whether the page is usable."""
        # Reject empty or short pages, Chinese titles, error pages, and LinkedIn
        if ((not result['content']) or (not result['title'])
                or (len(result['content']) < 200)
                or re.search(r"[\u4e00-\u9fa5]", result['title'])
                or ("403" in result["title"]) or ("404" in result["title"])
                or ("502" in result["title"])
                or ("Just a moment..." in result["title"])
                or ('www.linkedin.cn' in result['url'])):
            return False
        # SVM over TF-IDF features: label 1 means relevant
        if self.model.predict(self.tfidf.transform([self.preprocessing(result["content"])]))[0] == 1:
            return True
        else:
            return False
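
    # NLP/SVC.joblib and NLP/TFIDF.joblib are loaded in __init__ and are
    # assumed to be scikit-learn artifacts trained offline, along these
    # lines (a sketch, not the actual training code):
    #
    #     from sklearn.feature_extraction.text import TfidfVectorizer
    #     from sklearn.svm import SVC
    #     import joblib
    #     tfidf = TfidfVectorizer().fit(train_texts)
    #     model = SVC().fit(tfidf.transform(train_texts), train_labels)
    #     joblib.dump(tfidf, "NLP/TFIDF.joblib")
    #     joblib.dump(model, "NLP/SVC.joblib")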
    def preprocessing(self, text):
        """Tokenize, drop stopwords and short tokens, lowercase, and lemmatize."""
        tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        stops = stopwords.words('english')
        # Note: stopwords are filtered before lowercasing, so capitalized
        # stopwords ("The", "And", ...) survive this step
        tokens = [token for token in tokens if token not in stops]
        tokens = [token.lower() for token in tokens if len(token) >= 3]
        lmtzr = WordNetLemmatizer()
        tokens = [lmtzr.lemmatize(token) for token in tokens]
        preprocessed_text = ' '.join(tokens)
        return preprocessed_text
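
    # Example (a sketch; exact output depends on the installed corpora):
    #     spider.preprocessing("Schools are expanding adult education programs")
    #     -> "school expanding adult education program"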
    def start_crawl(self, query, engine):
        """Run the spider."""
        if (engine == 'bing') and self.search_from_bing(query):
            self.click_title_from_bing(query)
        elif (engine == 'google') and self.search_from_google(query):
            self.click_title_from_google(query)

    def close(self):
        """Shut down the spider."""
        # quit() ends the whole browser session, not just the current window
        self.driver.quit()


def main(engine="bing"):
    # Keyword list
    with open("querys.yaml", "r", encoding="utf-8") as fp:
        querys = yaml.safe_load(fp.read())
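    # Assumed querys.yaml layout, inferred from the nested loops below;
    # keys and phrases here are illustrative only:
    #
    #     Adult education:
    #       - policy:
    #           - "lifelong learning"
    #           - "vocational training"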
    # "temp" stores the last crawled key so an interrupted run can resume
    with open("temp", "r", encoding="utf-8") as ft:
        tk = ft.read()
    # A checkpoint containing "Adult education" is treated as a fresh start
    if "Adult education" in tk:
        tk = None
    start = not tk
    # Start the spider
    robot = AutoSpider()
    # Walk the keyword combinations
    for key in querys.keys():
        for cont in querys[key][0].keys():
            for query in querys[key][0][cont]:
                if start:
                    robot.start_crawl(key + " AND " + cont + " AND " + query, engine)
                    with open("temp", "w", encoding="utf-8") as ft:
                        ft.write(key + cont + query)
                elif key + cont + query == tk:
                    # Reached the checkpoint: resume crawling from here on
                    start = True
                    robot.start_crawl(key + " AND " + cont + " AND " + query, engine)
                    with open("temp", "w", encoding="utf-8") as ft:
                        ft.write(key + cont + query)
                else:
                    continue
                # Clear the saved results URL after each finished keyword
                with open('url.txt', 'w', encoding='utf-8') as fu:
                    fu.write('')
    # Shut down the spider
    robot.close()


if __name__ == '__main__':
    main('bing')