# newenergy.py — Scrapy spider for newenergy.giec.cas.cn
# (web-capture artifacts — file-size header and line-number run — removed)
  1. import time
  2. import scrapy
  3. from electric.items import ElectricItem
  4. # 新能源网
  5. class NgccSpider(scrapy.Spider):
  6. name = 'newenergy'
  7. allowed_domains = ['newenergy.giec.cas.cn']
  8. start_urls = [
  9. ('http://newenergy.giec.cas.cn/tyn/xydt/index{}.html', 31, '光伏'),
  10. ('http://newenergy.giec.cas.cn/tyn/jrjj/index{}.html', 31, '光伏'),
  11. ('http://newenergy.giec.cas.cn/tyn/gfdt/index{}.html', 31, '光伏'),
  12. ('http://newenergy.giec.cas.cn/tyn/grdt/index{}.html', 15, '光伏'),
  13. ('http://newenergy.giec.cas.cn/tyn/jcyy/index{}.html', 31, '光伏'),
  14. ('http://newenergy.giec.cas.cn/tyn/cpyjs/index{}.html', 31, '光伏'),
  15. ('http://newenergy.giec.cas.cn/fn/jrjj_15706/index{}.html', 31, '风能'),
  16. ('http://newenergy.giec.cas.cn/fn/fndt/index{}.html', 31, '风能'),
  17. ('http://newenergy.giec.cas.cn/fn/cydt/index{}.html', 31, '风能'),
  18. ('http://newenergy.giec.cas.cn/fn/fnzy/index{}.html', 2, '风能'),
  19. ('http://newenergy.giec.cas.cn/fn/cpyjs_15710/index{}.html', 23, '风能'),
  20. ('http://newenergy.giec.cas.cn/xsdt/index{}.html', 39, ''),
  21. ]
  22. custom_settings = {
  23. 'DOWNLOAD_DELAY': 6,
  24. 'DOWNLOAD_TIMEOUT': 1800,
  25. }
  26. def start_requests(self):
  27. for baseurl, maxpage, tag in self.start_urls:
  28. url = baseurl.format("")
  29. yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True, meta={"url": baseurl.format('_{}'), "maxpage": maxpage, "nextpage": 1, "tag":tag})
  30. def parse_pages(self, response):
  31. try:
  32. for item in response.xpath('//ul[@class="list_article"]/li'):
  33. url = item.css('a').attrib['href']
  34. yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin, meta={"url": url, "tag": response.meta['tag']})
  35. except:
  36. try:
  37. for item in response.xpath('//div/table[not(@class)]//tr'):
  38. url = item.css('a').attrib['href']
  39. yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin, meta={"url": url, "tag": response.meta['tag']})
  40. except:
  41. pass
  42. if response.meta['nextpage'] <= response.meta['maxpage']:
  43. nextpage = response.meta['url'].format(response.meta['nextpage'])
  44. yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={'url': response.meta['url'], "nextpage": response.meta['nextpage'] + 1, 'maxpage': response.meta['maxpage'], "tag": response.meta['tag']})
  45. def parse_item(self, response):
  46. url = response.url
  47. title = response.css('title::text').get()
  48. source = '新能源网'
  49. description = ''
  50. content = ''.join(response.xpath('//div[@class="TRS_Editor"]//text()').getall())
  51. date = time.time()
  52. column = response.meta['tag']
  53. yield ElectricItem(url=url, title=title, source=source,
  54. description=description, content=content,
  55. date=date, column=column)
  56. def errback_httpbin(self, failure):
  57. self.logger.error(repr(failure))