# -*- coding: utf-8 -*-
# @Author: privacy
# @Date: 2022-08-03 18:07:19
# @Last Modified by: privacy
# @Last Modified time: 2022-08-16 11:03:30
#
# Roughly one hundred web pages per hour.
# Parses url, title, author, publish_time, content and images.
#
import time

from pymongo import MongoClient
from gne import GeneralNewsExtractor
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from logger import LoggerHandler

# Results are written to MongoDB: database "education", collection "response".
client = MongoClient("192.168.1.200", 27017)
collection = client['education']['response']

logger = LoggerHandler(name="education")
logger.set_file_handler(filename="education.log")

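# Deduplication in click_title_from_list() queries this collection by "url".
# A unique index on that field (an assumption, not part of the original script)
# would keep those lookups fast and guard against duplicate inserts:
#
#     collection.create_index("url", unique=True)
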

class AutoSpider(object):
    """Use selenium to extract generic body text from pages found via the Bing search engine."""

    def __init__(self):
        super(AutoSpider, self).__init__()
        # chromedriver service
        service = Service(executable_path="chromedriver.exe")
        # browser options
        options = webdriver.ChromeOptions()
        # run headless (no UI)
        options.add_argument('--headless')
        # run with the highest privileges
        options.add_argument('--no-sandbox')
        # proxy configuration
        # options.add_argument('proxy-server={}'.format(self.proxy_server))
        # setting the size here has the same effect as set_window_size
        options.add_argument('--window-size={},{}'.format(1920, 1080))
        # language
        options.add_argument('--lang=en')
        # disable extensions
        options.add_argument('--disable-extensions')
        # options.add_argument('--disable-infobars')
        # ignore certificate errors
        options.add_argument('--ignore-certificate-errors')
        # disable notifications
        options.add_argument('--disable-notifications')
        options.add_argument('--force-device-scale-factor=1')
        options.add_argument('--disable-dev-shm-usage')
        # disable browser side navigation
        options.add_argument('--disable-browser-side-navigation')
        # do not load images, to speed things up
        options.add_argument('blink-settings=imagesEnabled=false')
        # info:0 warning:1 error:2 fail:3
        options.add_argument('log-level=2')
        options.add_argument("--disable-blink-features=AutomationControlled")
        # start in developer mode, where navigator.webdriver keeps its normal value
        options.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
        options.add_experimental_option('useAutomationExtension', False)
        # disable browser pop-ups
        prefs = {
            'profile.default_content_setting_values': {
                'notifications': 2
            }
        }
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(options=options, service=service)
        # inject the anti-detection (stealth) script into every new document
        with open('./stealth.min.js') as fj:
            js = fj.read()
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": js})
        # page load and script timeouts
        self.driver.set_page_load_timeout(60)
        self.driver.set_script_timeout(60)
        self.extractor = GeneralNewsExtractor()
        self.extract = self.extractor.extract

    def search_from_bing(self, search_query):
        """Search Bing and restrict the results to English pages."""
        try:
            self.driver.get('https://cn.bing.com/')
        except Exception:
            self.driver.execute_script('window.stop()')
            logger.info("page timeout!")
        WebDriverWait(self.driver, 180, 5).until(EC.visibility_of_element_located((By.ID, 'sb_form_q')))
        self.driver.find_element(By.ID, "sb_form_q").send_keys(search_query)
        time.sleep(3)
        self.driver.implicitly_wait(3)
        self.driver.find_element(By.ID, "sb_form_q").send_keys(Keys.ENTER)
        time.sleep(4)
        self.driver.implicitly_wait(3)
        # switch to English-only results
        WebDriverWait(self.driver, 180, 10).until(EC.visibility_of_element_located((By.ID, 'est_en')))
        self.driver.find_element(By.ID, "est_en").click()
        time.sleep(3)
        self.driver.implicitly_wait(3)
        return True

    def get_next_page_form_bing(self):
        """Click through to the next page of Bing results."""
        try:
            self.driver.refresh()
            time.sleep(3)
            self.driver.implicitly_wait(3)
            # a next-page button is present
            if self.driver.find_element(By.CSS_SELECTOR, "div.sw_next").text:
                self.driver.find_element(By.CSS_SELECTOR, "div.sw_next").click()
            # no next-page button: click the page number following the current one
            elif self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").text:
                self.driver.find_element(By.XPATH, "//a[@class='sb_pagS sb_pagS_bp b_widePag sb_bp']/parent::li/following-sibling::*").click()
            # page failed to load
            else:
                return False
            # success
            return True
        except Exception as e:
            logger.error(e)
            return False

    def click_title_from_list(self):
        """Click every result in the current Bing result list."""
        for epoch in range(5):
            # iterate over the result titles
            for index, item in enumerate(self.driver.find_elements(By.CSS_SELECTOR, "#b_results li.b_algo")):
                # check whether the url can be read
                try:
                    href = item.find_element(By.CSS_SELECTOR, "h2 a").get_attribute("href")
                    # skip pdf files
                    if href.endswith('pdf'):
                        continue
                except StaleElementReferenceException as e:
                    # the DOM changed; try to re-locate the element
                    try:
                        # XPath positions are 1-based, enumerate() is 0-based
                        item = self.driver.find_element(By.XPATH, "//*[@id='b_results']//li[{}]".format(index + 1))
                        href = item.find_element(By.CSS_SELECTOR, "h2 a").get_attribute("href")
                    except Exception as e:
                        continue
                # already crawled: skip this page
                if collection.find_one({"url": href}):
                    continue
                # try to open the page
                try:
                    time.sleep(10)
                    # a single window handle: safe to click
                    if len(self.driver.window_handles) == 1:
                        try:
                            item.find_element(By.TAG_NAME, "h2").click()
                        except Exception as e:
                            try:
                                element = item.find_element(By.TAG_NAME, "h2")
                                self.driver.execute_script('arguments[0].click()', element)
                            except Exception as e:
                                logger.error(e)
                    # more than one handle: close the extra windows first
                    else:
                        # iterate over the extra handles
                        for i in range(len(self.driver.window_handles) - 1):
                            # switch to the extra handle
                            self.driver.switch_to.window(self.driver.window_handles[1])
                            # and close it
                            self.driver.close()
                        # clean-up finished; perform the click
                        item.find_element(By.TAG_NAME, "h2").click()
                # opening the page failed
                except Exception as e:
                    logger.error(e)
                # the page opened successfully
                else:
                    # a new tab was opened
                    if len(self.driver.window_handles) == 2:
                        # switch to it
                        self.driver.switch_to.window(self.driver.window_handles[1])
                        # is it an article?
                        if self.check_article():
                            # parse it
                            self.parse_page_from_article()
                            time.sleep(10)
                finally:
                    # return to the results page
                    self.driver.switch_to.window(self.driver.window_handles[0])
                    self.driver.implicitly_wait(5)
            # the whole list has been processed: open the next page
            else:
                # opening the next page failed: stop crawling this query
                if not self.get_next_page_form_bing():
                    break
                # next page opened: wait for it to load
                self.driver.implicitly_wait(5)
        # switch back to the first window handle
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_page_from_article(self):
        """Parse the body of the currently open page."""
        # scroll to the bottom of the page
        try:
            height = self.driver.execute_script('return document.body.scrollHeight')
            if height > 1080:
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        except TimeoutException as e:
            # on timeout, try to stop loading
            try:
                self.driver.execute_script('window.stop ? window.stop() : document.execCommand("Stop");')
            except TimeoutException as e:
                logger.error('Timeout!')
            except Exception as e:
                logger.error(e)
        except Exception as e:
            self.driver.close()
            logger.error(e)
            return
        page_source = ""
        url = ""
        # page_source / current_url can time out on heavy pages, so retry a few times
        for x in range(3):
            try:
                # read the page source
                if not page_source:
                    page_source = self.driver.page_source
                # read the page url
                if not url:
                    url = self.driver.current_url
            except TimeoutException as e:
                logger.info('Timeout!')
            except Exception as e:
                logger.error(e)
        if page_source:
            try:
                result = self.extract(page_source)
            except Exception as e:
                result = {"title": "", "author": "", "publish_time": "", "content": "", "images": []}
            result['page_source'] = page_source
        else:
            self.driver.close()
            return
        if url:
            result['url'] = url
        else:
            self.driver.close()
            return
        if not result['title']:
            # fall back to the document title
            try:
                result['title'] = self.driver.title
            except Exception as e:
                logger.error(e)
        # metadata from <meta> tags
        metadata = {}
        try:
            for meta in self.driver.find_elements(By.TAG_NAME, "meta"):
                try:
                    if meta.get_attribute("name"):
                        metadata[meta.get_attribute("name")] = meta.get_attribute("content")
                except Exception:
                    pass
        except Exception as e:
            result['metadata'] = {}
        else:
            result['metadata'] = metadata
        if not result['content']:
            # fall back to the full body text
            try:
                result['content'] = self.driver.find_element(By.XPATH, "//body").text
            except NoSuchElementException as e:
                self.driver.close()
                return
            except TimeoutException as e:
                self.driver.close()
                return
            except Exception as e:
                self.driver.close()
                logger.error(e)
                return
        result["crawl_time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        result["used"] = 0
        # store the result
        try:
            logger.info(result)
            collection.insert_one(result)
        except Exception as e:
            logger.error(e)
        finally:
            self.driver.close()
            return

    def check_article(self):
        """Decide whether the current page is an article (currently accepts every page)."""
        return True
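
    # check_article() is a stub. One possible heuristic (a sketch, not part of
    # the original script) would be to require a minimum amount of body text:
    #
    #     body_text = self.driver.find_element(By.XPATH, "//body").text
    #     return len(body_text) > 500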

    def start_crawl(self, query):
        """Run one search-and-crawl pass for a single query."""
        if self.search_from_bing(query):
            self.click_title_from_list()

    def close(self):
        """Shut down the crawler."""
        # quit() closes every window and terminates the chromedriver process;
        # close() would only close the current window.
        self.driver.quit()


def main():
    # keyword list
    with open('querys.txt', 'r', encoding='utf-8') as fp:
        querys = fp.read()
    # start the crawler
    robot = AutoSpider()
    # crawl each keyword in turn
    for query in querys.split('\n'):
        # skip blank lines (e.g. a trailing newline in querys.txt)
        if not query.strip():
            continue
        robot.start_crawl(query)
    # shut the crawler down
    robot.close()
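
# querys.txt is expected to hold one search keyword per line, for example
# (illustrative values only, not from the original project):
#
#     education policy
#     distance learning
#     teacher training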

if __name__ == '__main__':
    main()