# ewindpower.py (2.1 KB)

  1. import time
  2. import scrapy
  3. from electric.items import ElectricItem
  4. # 国际风能网
  5. class EwpcSpider(scrapy.Spider):
  6. name = 'ewindpower'
  7. download_delay = 15
  8. allowed_domains = ['ewindpower.cn']
  9. start_urls = [
  10. 'http://www.ewindpower.cn/news/list-htm-catid-15-page-{}.html',
  11. 'http://www.ewindpower.cn/news/list-htm-catid-14-page-{}.html',
  12. ]
  13. def start_requests(self):
  14. yield scrapy.Request(url='http://www.ewindpower.cn/news/list-htm-catid-14.html', callback=self.parse_pages, errback=self.errback_httpbin)
  15. yield scrapy.Request(url='http://www.ewindpower.cn/news/list-htm-catid-15.html', callback=self.parse_pages, errback=self.errback_httpbin)
  16. for baseurl in self.start_urls:
  17. for page in range(2, 12):
  18. url = baseurl.format(page)
  19. self.logger.info('next page: {}'.format(url))
  20. yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
  21. def parse_pages(self, response):
  22. for item in response.css('div#iframe_11 > span > table li'):
  23. url = item.css('a').attrib['href']
  24. yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
  25. # nextpage = response.css('div.pages a[title="下一页"]').attrib['href']
  26. # self.logger.info('next page: {}'.format(nextpage))
  27. # yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
  28. def parse_items(self, response):
  29. url = response.url
  30. title = response.css('title::text').get()
  31. source = '国际风能网'
  32. description = response.css('meta[name=description]::attr(content)').get()
  33. content = ''.join(response.xpath('//div[@class="content"]//text()').getall())
  34. date = time.time()
  35. column = '风能'
  36. self.logger.info(title)
  37. yield ElectricItem(url=url, title=title, source=source,
  38. description=description, content=content,
  39. date=date, column=column)
  40. def errback_httpbin(self, failure):
  41. self.logger.error(repr(failure))