|
@@ -0,0 +1,41 @@
|
|
|
+import time
|
|
|
+import scrapy
|
|
|
+from electric.items import ElectricItem
|
|
|
+
|
|
|
+# Wind-energy industry network (风能产业网) — NOTE(review): this comment does not
+# match the spider below, which targets paper.people.com.cn (People's Daily
+# e-papers); likely copied from a wind-energy spider template.
|
|
|
# Spider for e-papers hosted on paper.people.com.cn: the People's Daily
# (人民日报, rmrb) and China Energy News (中国能源报, zgnyb).
class CweeaSpider(scrapy.Spider):
    name = 'ppcc'
    allowed_domains = ['paper.people.com.cn']
    start_urls = ['http://paper.people.com.cn/rmrb/index.html',
                  'http://paper.people.com.cn/zgnyb/paperindex.htm']

    # Heavy per-request delay to stay polite to the e-paper site.
    custom_settings = {
        'DOWNLOAD_DELAY': 20,
    }

    def start_requests(self):
        """Start crawling from each paper's front-page index URL."""
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_blocks,
                                 errback=self.errback_httpbin)

    def parse_blocks(self, response):
        """Follow every page-section link (id="pageLink") on the index page."""
        for url in response.css('a[id="pageLink"]::attr(href)').getall():
            yield response.follow(url=url, callback=self.parse_pages,
                                  errback=self.errback_httpbin)

    def parse_pages(self, response):
        """Follow every article link in a page's .news-list listing."""
        for url in response.css('.news-list a::attr(href)').getall():
            yield response.follow(url=url, callback=self.parse_item,
                                  errback=self.errback_httpbin)

    def parse_item(self, response):
        """Extract a single article into an ElectricItem.

        BUG FIX: ``source`` was hard-coded to '人民日报' (People's Daily)
        even for articles crawled from the zgnyb start URL (中国能源报 /
        China Energy News), mis-attributing those articles. Derive the
        source from the article URL's path segment instead.
        """
        url = response.url
        title = response.css('title::text').get()
        # Both papers share the paper.people.com.cn host; the path segment
        # distinguishes them (assumes article URLs keep the /zgnyb/ prefix
        # of the index they were followed from — TODO confirm).
        source = '中国能源报' if '/zgnyb/' in url else '人民日报'
        description = ''
        # Concatenate article paragraphs and drop all whitespace runs.
        content = ''.join(''.join(response.css('#ozoom p::text').getall()).split())
        # NOTE(review): this records crawl time (epoch seconds), not the
        # article's publication date.
        date = time.time()
        column = ''
        yield ElectricItem(url=url, title=title, source=source,
                           description=description, content=content,
                           date=date, column=column)

    def errback_httpbin(self, failure):
        """Log any request failure; no retry logic here."""
        self.logger.error(repr(failure))
|