
Modified: items.py
Modified: pipelines.py
Modified: settings.py
Modified: spiders/ewindpower.py
Modified: spiders/newenergy.py
New file: spiders/ppcc.py
Modified: spiders/sungrow.py
Modified: spiders/zzsolar.py
Modified: ../../../timertask.py
../../../.dockerignore
../../../Dockerfile

privacy 3 years ago
Parent
Commit
d8b08c2e93

+ 0 - 6
projects/electric/electric/items.py

@@ -17,9 +17,3 @@ class ElectricItem(scrapy.Item):
     date = scrapy.Field(serializer=str)
     column = scrapy.Field(serializer=str)
 
-class PatentItem(scrapy.Item):
-	nam = scrapy.Field(serializer=str)
-	num = scrapy.Field(serializer=str)
-	org = scrapy.Field(serializer=str)
-	per = scrapy.Field(serializer=str)
-	des = scrapy.Field(serializer=str)
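With PatentItem removed, ElectricItem is the only item type left in the project. For reference, the fields it has to declare can be inferred from the ElectricItem(...) calls in the spiders below; only the date and column lines shown above are confirmed by the hunk, the rest of this sketch is an assumption:

import scrapy

class ElectricItem(scrapy.Item):
    # Fields inferred from ElectricItem(url=..., title=..., ...) calls in the
    # spiders in this commit; serializer=str is confirmed only for date/column.
    url = scrapy.Field(serializer=str)
    title = scrapy.Field(serializer=str)
    source = scrapy.Field(serializer=str)
    description = scrapy.Field(serializer=str)
    content = scrapy.Field(serializer=str)
    date = scrapy.Field(serializer=str)
    column = scrapy.Field(serializer=str)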

+ 1 - 1
projects/electric/electric/pipelines.py

@@ -84,7 +84,7 @@ class PushUrlPipeline:
         return cls
 
     def process_item(self, item, spider):
-        if isinstance(item ElectricItem):
+        if isinstance(item, ElectricItem):
             adapter = ItemAdapter(item)
             data = {
                 "url": adapter['url'],

+ 2 - 2
projects/electric/electric/settings.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 # Scrapy settings for electric project
-
+import os
 
 # Auto-generated configuration; no need to pay attention to it, do not modify
 BOT_NAME = 'electric'
@@ -111,7 +111,7 @@ HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'  #本
 
 MONGO_DB_URI = "mongodb://localhost:27017"
 MONGO_DB_NAME = "electric"
-PUSH_URI = 'http://localhost:9999/talent/insertUrlTime'
+PUSH_URI = "http://" + os.environ.get("Back_End_Ip") + ":" + os.environ.get("Back_End_Port") + "/talent/insertUrlTime"
 
 USER_AGENTS = [
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.55",

+ 2 - 2
projects/electric/electric/spiders/ewindpower.py

@@ -35,7 +35,7 @@ class EwpcSpider(scrapy.Spider):
         title = response.css('title::text').get()
         source = '国际风能网'
         description = response.css('meta[name=description]::attr(content)').get()
-        content = response.xpath('//div[@class="content"]//text()').getall()
+        content = ''.join(response.xpath('//div[@class="content"]//text()').getall())
         date = time.time()
         column = '风能'
         self.logger.info(title)
@@ -44,4 +44,4 @@ class EwpcSpider(scrapy.Spider):
                             date=date, column=column)
 
     def errback_httpbin(self, failure):
-        self.logger.error(repr(failure))
+        self.logger.error(repr(failure))
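The ''.join(...) change here, and the identical ones in newenergy.py, sungrow.py and zzsolar.py below, fixes a type mismatch: getall() returns a list of text nodes, while the content field is presumably meant to hold a single string (the item fields are declared with serializer=str). A toy illustration:

from scrapy.selector import Selector

sel = Selector(text='<div class="content"><p>风机</p><p>并网</p></div>')
sel.xpath('//div[@class="content"]//text()').getall()            # ['风机', '并网']
''.join(sel.xpath('//div[@class="content"]//text()').getall())   # '风机并网'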

+ 2 - 2
projects/electric/electric/spiders/newenergy.py

@@ -52,7 +52,7 @@ class NgccSpider(scrapy.Spider):
         title = response.css('title::text').get()
         source = '新能源网'
         description = ''
-        content = response.xpath('//div[@class="TRS_Editor"]//text()').getall()
+        content = ''.join(response.xpath('//div[@class="TRS_Editor"]//text()').getall())
         date = time.time()
         column = response.meta['tag']
         yield ElectricItem(url=url, title=title, source=source,
@@ -60,4 +60,4 @@ class NgccSpider(scrapy.Spider):
                             date=date, column=column)
 
     def errback_httpbin(self, failure):
-        self.logger.error(repr(failure))
+        self.logger.error(repr(failure))

+ 41 - 0
projects/electric/electric/spiders/ppcc.py

@@ -0,0 +1,41 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# People's Daily epaper (paper.people.com.cn)
+class CweeaSpider(scrapy.Spider):
+    name = 'ppcc'
+    allowed_domains = ['paper.people.com.cn']
+    start_urls = ['http://paper.people.com.cn/rmrb/index.html',
+                  'http://paper.people.com.cn/zgnyb/paperindex.htm']
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 20,
+    }
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_blocks, errback=self.errback_httpbin)
+
+    def parse_blocks(self, response):
+        for url in response.css('a[id="pageLink"]::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_pages(self, response):
+        for url in response.css('.news-list a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '人民日报'
+        description = ''
+        content = ''.join(''.join(response.css('#ozoom p::text').getall()).split())
+        date = time.time()
+        column = ''
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))
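The nested join/split in parse_item is a compact way to drop every run of whitespace, not just leading and trailing spaces; the epaper markup is heavy on indentation and newlines between <p> fragments. For example:

# str.split() with no argument splits on any whitespace run, so joining the
# pieces back removes all spaces, tabs and newlines from the article body.
text = ' 本报北京电\n    (记者 报道) '
''.join(text.split())   # '本报北京电(记者报道)'

The 20-second DOWNLOAD_DELAY in custom_settings also keeps this spider far more polite than the project default, which seems deliberate for a newspaper site.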

+ 2 - 2
projects/electric/electric/spiders/sungrow.py

@@ -43,7 +43,7 @@ class SungrowSpider(scrapy.Spider):
         title = response.css('title::text').get()
         source = '阳光电源股份有限公司'
         description = response.css('meta[name=description]::attr(content)').get()
-        content = response.xpath('//*[@id="content"]//p//text()').getall()
+        content = ''.join(response.xpath('//*[@id="content"]//p//text()').getall())
         date = response.meta['date']
         column = response.css('meta[name=keywords]::attr(content)').get()
         self.logger.info(title)
@@ -52,4 +52,4 @@ class SungrowSpider(scrapy.Spider):
                             date=date, column=column)
 
     def errback_httpbin(self, failure):
-        self.logger.error(repr(failure))
+        self.logger.error(repr(failure))

+ 1 - 1
projects/electric/electric/spiders/zzsolar.py

@@ -34,7 +34,7 @@ class ZzsolarSpider(scrapy.Spider):
         title = response.css('h1::text').get()
         source = '郑州国际太阳能光伏展览会'
         description = ''
-        content = response.xpath('//div[@class="showContxt"]//text()').getall()
+        content = ''.join(response.xpath('//div[@class="showContxt"]//text()').getall())
         date = time.time()
         column = '光伏'
         yield ElectricItem(url=url, title=title, source=source,

+ 2 - 2
timertask.py

@@ -17,8 +17,8 @@ from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
 
 
 logging.basicConfig(level=logging.INFO,
-	filename='timertask.log',
-	format='%(asctime)s:%(levelname)s:%(message)s'
+    filename='timertask.log',
+    format='%(asctime)s:%(levelname)s:%(message)s'
 )
 
 sched = BlockingScheduler(timezone="Asia/Shanghai")
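The timertask.py hunk itself only normalizes indentation (tabs to spaces) in the logging call. For context, the EVENT_JOB_EXECUTED / EVENT_JOB_ERROR imports shown above are typically attached to the scheduler like this; the listener body is a sketch, not part of the commit:

def job_listener(event):
    # JobExecutionEvent carries .job_id and, on failure, .exception.
    if event.exception:
        logging.error('job %s raised %r', event.job_id, event.exception)
    else:
        logging.info('job %s executed', event.job_id)

sched.add_listener(job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)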