
first commit

privacy committed 3 years ago
Current commit: 6f7bd5e70d
42 files changed, 2496 insertions(+), 0 deletions(-)
  1. README.md (+0 -0)
  2. projects/electric/electric/__init__.py (+5 -0)
  3. projects/electric/electric/items.py (+25 -0)
  4. projects/electric/electric/middlewares.py (+138 -0)
  5. projects/electric/electric/pipelines.py (+99 -0)
  6. projects/electric/electric/settings.py (+142 -0)
  7. projects/electric/electric/spiders/__init__.py (+5 -0)
  8. projects/electric/electric/spiders/bjx.py (+133 -0)
  9. projects/electric/electric/spiders/cecn.py (+75 -0)
  10. projects/electric/electric/spiders/ceec.py (+50 -0)
  11. projects/electric/electric/spiders/ceeia.py (+52 -0)
  12. projects/electric/electric/spiders/chinanengyuan.py (+53 -0)
  13. projects/electric/electric/spiders/chinapower.py (+53 -0)
  14. projects/electric/electric/spiders/chinapv.py (+45 -0)
  15. projects/electric/electric/spiders/chng.py (+57 -0)
  16. projects/electric/electric/spiders/cnenergynews.py (+57 -0)
  17. projects/electric/electric/spiders/cnnpn.py (+62 -0)
  18. projects/electric/electric/spiders/cpnn.py (+80 -0)
  19. projects/electric/electric/spiders/csg.py (+53 -0)
  20. projects/electric/electric/spiders/ctg.py (+61 -0)
  21. projects/electric/electric/spiders/cweea.py (+41 -0)
  22. projects/electric/electric/spiders/eptchina.py (+48 -0)
  23. projects/electric/electric/spiders/escn.py (+45 -0)
  24. projects/electric/electric/spiders/ewindpower.py (+47 -0)
  25. projects/electric/electric/spiders/gxepa.py (+53 -0)
  26. projects/electric/electric/spiders/iesplaza.py (+65 -0)
  27. projects/electric/electric/spiders/nengyuanjie.py (+50 -0)
  28. projects/electric/electric/spiders/newenergy.py (+63 -0)
  29. projects/electric/electric/spiders/piec.py (+42 -0)
  30. projects/electric/electric/spiders/powerchina.py (+56 -0)
  31. projects/electric/electric/spiders/solarbe.py (+45 -0)
  32. projects/electric/electric/spiders/solarenpv.py (+47 -0)
  33. projects/electric/electric/spiders/sungrow.py (+55 -0)
  34. projects/electric/electric/spiders/twea.py (+46 -0)
  35. projects/electric/electric/spiders/xhhydropower.py (+51 -0)
  36. projects/electric/electric/spiders/zzsolar.py (+45 -0)
  37. projects/electric/scrapy.cfg (+11 -0)
  38. projects/electric/setup.py (+10 -0)
  39. scrapydweb_settings_v10.py (+358 -0)
  40. spiders.conf (+9 -0)
  41. start.sh (+7 -0)
  42. timertask.py (+57 -0)

+ 0 - 0
README.md


+ 5 - 0
projects/electric/electric/__init__.py

@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @Author: sprivacy
+# @Date:   2022-01-26 15:38:32
+# @Last Modified by:   sprivacy
+# @Last Modified time: 2022-02-11 10:07:31

+ 25 - 0
projects/electric/electric/items.py

@@ -0,0 +1,25 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class ElectricItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    url = scrapy.Field(serializer=str)
+    title = scrapy.Field(serializer=str)
+    source = scrapy.Field(serializer=str)
+    description = scrapy.Field(serializer=str)
+    content = scrapy.Field(serializer=str)
+    date = scrapy.Field(serializer=str)
+    column = scrapy.Field(serializer=str)
+
+class PatentItem(scrapy.Item):
+    nam = scrapy.Field(serializer=str)
+    num = scrapy.Field(serializer=str)
+    org = scrapy.Field(serializer=str)
+    per = scrapy.Field(serializer=str)
+    des = scrapy.Field(serializer=str)
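The serializer=str arguments only apply when items go through Scrapy's feed exporters; the pipelines in this commit instead flatten items to plain dicts with ItemAdapter before inserting them into MongoDB. A minimal sketch of that conversion, with made-up field values:

    from itemadapter import ItemAdapter
    from electric.items import ElectricItem

    # Build an item the way the spiders do, then flatten it for insert_one().
    item = ElectricItem(url='https://example.com/news/1', title='demo title', source='demo source',
                        description='', content='...', date='1644547651', column='')
    doc = ItemAdapter(item).asdict()   # plain dict whose keys mirror the Field names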

+ 138 - 0
projects/electric/electric/middlewares.py

@@ -0,0 +1,138 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+# -----user code start-------
+import time
+import random
+from scrapy.http.response.html import HtmlResponse
+# -----user code end---------
+
+from scrapy import signals
+
+# -----user code start-------
+from itemadapter import is_item, ItemAdapter
+
+from electric.settings import USER_AGENTS
+from electric.settings import PROXIES
+# -----user code end---------
+
+# useful for handling different item types with a single interface
+
+class ProxyMiddleware(object):
+    def process_request(self, request, spider):
+        proxy = random.choice(PROXIES)
+        request.meta['proxy'] = "http://" + proxy['ip_port']
+
+class RandomUserAgentMiddleware(object):
+    def process_request(self, request, spider):
+        user_agent = random.choice(USER_AGENTS)
+        request.headers['User-Agent'] = user_agent
+
+
+class ElectricSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class ElectricDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+
+        # -----user code start-------
+        spider.logger.info("use selenium request url {}".format(request.url))
+        # self.driver.implicitly_wait(10)
+        try:
+            spider.driver.get(request.url)
+            time.sleep(5)
+            spider.driver.execute_script('document.getElementById("J-global-toolbar").scrollIntoView()')
+            time.sleep(5)
+            self.now_page = spider.driver.page_source
+            return HtmlResponse(url=request.url, body=self.now_page, request=request, encoding='utf-8', status=200)
+        except Exception as exc:
+            spider.logger.warning('selenium request failed: %r', exc)
+            return None
+        # -----user code end---------
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
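ElectricDownloaderMiddleware.process_request assumes the spider exposes a Selenium driver as spider.driver, but no spider in this commit defines one, and the middleware is not enabled in the DOWNLOADER_MIDDLEWARES setting later in the commit. A minimal sketch of a spider that would satisfy that assumption (the class name, URL, and headless-Chrome setup are illustrative, not part of the commit):

    import scrapy
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    class SeleniumDemoSpider(scrapy.Spider):
        name = 'selenium_demo'   # hypothetical spider, for illustration only
        start_urls = ['https://example.com']

        custom_settings = {
            # enable the Selenium-rendering middleware for this spider only
            'DOWNLOADER_MIDDLEWARES': {'electric.middlewares.ElectricDownloaderMiddleware': 543},
        }

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            options = Options()
            options.add_argument('--headless')
            self.driver = webdriver.Chrome(options=options)   # the attribute the middleware reads

        def closed(self, reason):
            self.driver.quit()   # release the browser when the spider finishes

        def parse(self, response):
            # response here is the HtmlResponse built from driver.page_source
            yield {'title': response.css('title::text').get()}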

+ 99 - 0
projects/electric/electric/pipelines.py

@@ -0,0 +1,99 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+import json
+import logging
+import requests
+from datetime import datetime
+import pymongo
+from itemadapter import ItemAdapter
+from electric.items import ElectricItem
+
+logging.basicConfig(level=logging.INFO,
+    filename='push_url_error.log',
+    format='%(asctime)s:%(levelname)s:%(message)s'
+)
+
+class ElectricPipeline:
+    def process_item(self, item, spider):
+        return item
+
+# Raw (original) news store
+class ElectricMongoDBPipeline:
+    """docstring for MongodbPipline"""
+    @classmethod
+    def from_crawler(cls, crawler):
+        cls.DB_URL = crawler.settings.get('MONGO_DB_URI', 'mongodb://localhost:27017')
+        cls.DB_NAME = crawler.settings.get('MONGO_DB_NAME', 'electric')
+        return cls()
+
+    def open_spider(self, spider):
+        self.client = pymongo.MongoClient(self.DB_URL)
+        self.db = self.client[self.DB_NAME]
+
+    def close_spider(self, spider):
+        self.client.close()
+
+    def process_item(self, item, spider):
+        if isinstance(item, ElectricItem):
+            collection = self.db["raw_news"]
+            adapter = ItemAdapter(item)
+            postItem = adapter.asdict()
+            collection.insert_one(postItem)
+            return item
+        else:
+            return item
+
+# Temporary store
+class TempMongoDBPipeline:
+    @classmethod
+    def from_crawler(cls, crawler):
+        cls.DB_URL = crawler.settings.get('MONGO_DB_URI', 'mongodb://localhost:27017')
+        cls.DB_NAME = crawler.settings.get('MONGO_DB_NAME', 'electric')
+        return cls()
+
+    def open_spider(self, spider):
+        self.client = pymongo.MongoClient(self.DB_URL)
+        self.db = self.client[self.DB_NAME]
+
+    def close_spider(self, spider):
+        self.client.close()
+
+    def process_item(self, item, spider):
+        if isinstance(item, ElectricItem):
+            collection = self.db["temp"]
+            adapter = ItemAdapter(item)
+            postItem = adapter.asdict()
+            collection.insert_one(postItem)
+            return item
+        else:
+            return item
+
+# Push a record of each crawled URL to an external service
+class PushUrlPipeline:
+    @classmethod
+    def from_crawler(cls, crawler):
+        cls.Push_URL = crawler.settings.get('PUSH_URI', 'http://localhost:9999/talent/insertUrlTime')
+        cls.headers = {
+            'contentType': 'Application/json'
+        }
+        return cls()
+
+    def process_item(self, item, spider):
+        if isinstance(item, ElectricItem):
+            adapter = ItemAdapter(item)
+            data = {
+                'url': adapter['url'],
+                'createTime': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+                'name': adapter['source']
+            }
+            response = requests.post(url=self.Push_URL, headers=self.headers, json=data)
+            if json.loads(response.text)['code'] != 200:
+                logging.error(response.text)
+            return item
+        else:
+            return item

+ 142 - 0
projects/electric/electric/settings.py

@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+# Scrapy settings for electric project
+
+
+# Auto-generated project settings; normally no need to modify these
+BOT_NAME = 'electric'
+SPIDER_MODULES = ['electric.spiders']
+NEWSPIDER_MODULE = 'electric.spiders'
+
+
+LOG_LEVEL = "INFO"
+LOG_ENABLED = True
+
+
+FEED_EXPORT_ENCODING = 'utf-8'
+
+MIN_RANDOM_DELAY = 5
+MAX_RANDOM_DELAY = 300
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Maximum number of concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 16
+
+# Download delay for requests to the same website (default: 0)
+DOWNLOAD_DELAY = 10
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# Only one of the following two limits is honoured; PER_DOMAIN is used by default:
+CONCURRENT_REQUESTS_PER_DOMAIN = 1  # concurrent requests per website
+# CONCURRENT_REQUESTS_PER_IP = 2  # concurrent requests per IP; DOWNLOAD_DELAY then applies per IP instead of per domain
+
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable the Telnet console (enabled by default)
+TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers (mostly unused here):
+# USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
+# DEFAULT_REQUEST_HEADERS = {
+#     'User-Agent': USER_AGENT,
+#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#     'Accept-Language': 'en',
+# }
+
+SPLASH_URL = 'http://localhost:8050'
+
+# Spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#     'electric.middlewares.ElectricSpiderMiddleware': 543,
+#     'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
+# }
+
+# Downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+#     "electric.middlewares.RandomDelayMiddleware": 400,
+#     'electric.middlewares.ProxyMiddleware': 500,      
+    'electric.middlewares.RandomUserAgentMiddleware': 543,
+#     'scrapy_splash.SplashCookiesMiddleware': 723,
+#     'scrapy_splash.SplashMiddleware': 725,
+#     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
+}
+
+# DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
+
+DNSCACHE_ENABLED = True
+RETRY_ENABLED = False
+
+# Extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    # 'electric.pipelines.ElectricPipeline': 301,
+    # raw/original database
+    'electric.pipelines.ElectricMongoDBPipeline': 300,
+    # temporary database
+    'electric.pipelines.TempMongoDBPipeline': 301,
+#    'electric.pipelines.PushUrlPipeline': 400,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+AUTOTHROTTLE_ENABLED = True
+# Initial download delay
+AUTOTHROTTLE_START_DELAY = 5
+# Maximum download delay in case of high latencies
+AUTOTHROTTLE_MAX_DELAY = 300
+# Average number of requests Scrapy should send in parallel to each remote server
+AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+AUTOTHROTTLE_DEBUG = False
+
+# HTTP cache
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 0  # cache lifetime (0 = never expire)
+HTTPCACHE_DIR = 'httpcache'  # storage path
+HTTPCACHE_IGNORE_HTTP_CODES = [400, 402, 403, 404, 500, 501, 502, 504, 520]  # responses not to cache
+HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'  # local filesystem storage
+# HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'  # Splash-aware HTTP cache
+# HTTPCACHE_GZIP = False  # gzip-compress the cached data
+
+MONGO_DB_URI = "mongodb://localhost:27017"
+MONGO_DB_NAME = "electric"
+PUSH_URI = 'http://localhost:9999/talent/insertUrlTime'
+
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.55",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
+    # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
+    # "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
+    # "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
+    # "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
+    # "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
+    # "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
+    # "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
+    # "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
+    # "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+    # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
+    # "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36",
+    # "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
+    # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
+    # "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
+    # "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
+    # "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
+    # "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
+    # "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
+]
+
+PROXIES = [
+    {'ip_port': ''},
+]
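DOWNLOADER_MIDDLEWARES above references electric.middlewares.RandomDelayMiddleware (commented out), and MIN_RANDOM_DELAY / MAX_RANDOM_DELAY are defined here, but middlewares.py in this commit contains no such class. A minimal sketch of what it might look like, assuming it is meant to sleep a random number of seconds within those bounds before each request:

    import random
    import time

    class RandomDelayMiddleware(object):
        """Sleep a random interval before each request (sketch, not part of the commit)."""

        def __init__(self, min_delay, max_delay):
            self.min_delay = min_delay
            self.max_delay = max_delay

        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                crawler.settings.getint('MIN_RANDOM_DELAY', 5),
                crawler.settings.getint('MAX_RANDOM_DELAY', 300),
            )

        def process_request(self, request, spider):
            delay = random.randint(self.min_delay, self.max_delay)
            spider.logger.debug('random delay: %d s', delay)
            time.sleep(delay)   # blocks the downloader; crude, but matches the commented-out setting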

+ 5 - 0
projects/electric/electric/spiders/__init__.py

@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @Author: sprivacy
+# @Date:   2022-02-11 10:08:53
+# @Last Modified by:   sprivacy
+# @Last Modified time: 2022-02-11 10:08:53

+ 133 - 0
projects/electric/electric/spiders/bjx.py

@@ -0,0 +1,133 @@
+import time
+
+import scrapy
+from electric.items import ElectricItem
+# from scrapy import Request
+# import scrapy_splash
+# from scrapy_splash import SplashRequest
+# from scrapy.shell import inspect_response
+
+
+# 北极星
+class BjxSpider(scrapy.Spider):
+    name = 'bjx'
+    download_delay = 10
+    allowed_domains = ['bjx.com.cn']
+    start_urls = [
+        'https://www.bjx.com.cn',
+    ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_class, errback=self.errback_httpbin, dont_filter=True)
+            # yield SplashRequest(url=url, endpoint='render.html',
+                                # args={"wait": 5, 'timeout': 90}, callback=self.parse_pages)
+
+    # Follow links to each channel (sub-site)
+    def parse_class(self, response):
+        yield response.follow(url=response.css('a[title="风力发电"]::attr(href)').get(), callback=self.parse_fengdian, errback=self.errback_httpbin, meta={"column": '风电'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="光伏太阳能"]::attr(href)').get(), callback=self.parse_fengdian, errback=self.errback_httpbin, meta={"column": '光伏'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="储能"]::attr(href)').get(), callback=self.parse_chuneng, errback=self.errback_httpbin, meta={"column": '储能'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="综合能源服务"]::attr(href)').get(), callback=self.parse_zhihui, errback=self.errback_httpbin, meta={"column": '智慧能源'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="核电"]::attr(href)').get(), callback=self.parse_fengdian, errback=self.errback_httpbin, meta={"column": '核电'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="电力软件"]::attr(href)').get(), callback=self.parse_software, errback=self.errback_httpbin, meta={"column": '计算机'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="智能电网"]::attr(href)').get(), callback=self.parse_dianwang, errback=self.errback_httpbin, meta={"column": '智慧能源'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="氢能"]::attr(href)').get(), callback=self.parse_fengdian, errback=self.errback_httpbin, meta={"column": '氢能'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="分布式能源"]::attr(href)').get(), callback=self.parse_tanzichan, errback=self.errback_httpbin, meta={"column": '碳资产'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="自动化"]::attr(href)').get(), callback=self.parse_auto, errback=self.errback_httpbin, meta={"column": '计算机'}, dont_filter=True)
+        yield response.follow(url=response.css('a[title="电力通信"]::attr(href)').get(), callback=self.parse_msg, errback=self.errback_httpbin, meta={"column": '计算机'}, dont_filter=True)
+
+
+    def parse_fengdian(self, response):
+        for url in response.css(".left div .cc-section small a::attr(href)").getall():
+            yield response.follow(url=url, callback=self.parse_pages_fengdian, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+    def parse_chuneng(self, response):
+        for url in response.css('.center .active a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages_fengdian, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+    def parse_zhihui(self, response):
+        for url in response.css('.titled a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages_zhihui, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+    def parse_software(self, response):
+        for url in response.css('h3 span a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages_software, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+    def parse_dianwang(self, response):
+        for url in response.css('#menuItem_yw a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages_software, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+    def parse_tanzichan(self, response):
+        for url in response.css('.with-top-news a.more::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages_tanzichan, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+    def parse_auto(self, response):
+        for url in response.css('.main2_leftbox span  a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages_software, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+    def parse_msg(self, response):
+        for url in response.css('.cc-list-content a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_items_fengdian, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+    def parse_pages_fengdian(self, response):
+        for url in response.css(".cc-list-content a::attr(href)").getall():
+            yield response.follow(url=url, callback=self.parse_items_fengdian, errback=self.errback_httpbin, meta={"column": response.meta['column']})
+        try:
+            nextpage = response.css(".cc-paging a").re('<a href="(.*?)".*>下一页')[0]
+            yield response.follow(url=nextpage, callback=self.parse_pages_fengdian, errback=self.errback_httpbin, meta={"column": response.meta['column']})
+        except Exception as e:
+            self.logger.info(e)
+
+    def parse_pages_zhihui(self, response):
+        for url in response.css(".top a::attr(href)").getall():
+            yield response.follow(url=url, callback=self.parse_items_fengdian, errback=self.errback_httpbin, meta={"column": response.meta['column']})
+
+    def parse_pages_software(self, response):
+        for url in response.css('.list_left_ul a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_items_fengdian, errback=self.errback_httpbin, meta={"column": response.meta['column']})
+        try:
+            nextpage = response.css(".page a[title]::attr(href)").get()
+            yield response.follow(url=nextpage, callback=self.parse_pages_software, errback=self.errback_httpbin, meta={"column": response.meta['column']})
+        except Exception as e:
+            self.logger.info(e)
+
+    def parse_pages_tanzichan(self, response):
+        for url in response.css('.news-list-ul a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_items_fengdian, errback=self.errback_httpbin, meta={"column": response.meta['column']}, dont_filter=True)
+
+
+
+    def parse_items_fengdian(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '北极星电力软件网'
+        description = response.css('meta[name=Description]::attr(content)').get()
+        content = "".join(response.css(".cc-article p::text").getall())
+        date = time.time()
+        column = response.meta['column']
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def parse_items_zhihui(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '北极星'
+        description = response.css('meta[name=Description]::attr(content)').get()
+        # css(...).getall() returns an empty list instead of raising, so fall back explicitly
+        content = "".join(response.css(".list_detail > p::text").getall()) or \
+            "".join(response.css('div.newsrand p::text').getall())
+        date = time.time()
+        column = response.meta['column']
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 75 - 0
projects/electric/electric/spiders/cecn.py

@@ -0,0 +1,75 @@
+import re
+import time
+import scrapy
+from electric.items import ElectricItem
+# from scrapy import Request
+# import scrapy_splash
+# from scrapy_splash import SplashRequest
+
+
+# 建设工程造价信息网
+class CecnSpider(scrapy.Spider):
+    name = 'cecn'
+    download_delay = 10
+    allowed_domains = ['cecn.org.cn']
+    start_urls = [
+                'http://www.cecn.org.cn/NewList.asp?tid=1&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=5&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=13&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=15&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=19&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=21&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=28&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=29&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=30&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=31&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=40&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=41&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=42&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=43&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=44&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=45&searchstring=&ThisPage=1',
+                'http://www.cecn.org.cn/NewList.asp?tid=46&searchstring=&ThisPage=1',
+            ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, meta={"pagenum": 1}, dont_filter=True)
+            # yield SplashRequest(url=url, endpoint='render.html',
+            #                     args={"wait": 5, 'timeout': 90}, callback=self.parse_pages,
+            #                     meta={"pagenum": 1})
+
+    def parse_pages(self, response):
+        for url in response.css('ul.mew_list li a::attr(href)').getall():
+            self.logger.info('item url: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin, meta={"pagenum": None})
+            # yield SplashRequest(url=url, endpoint='render.html',
+            #                     args={"wait": 5, 'timeout': 90}, callback=self.parse_items,
+            #                     meta={"date": date, "pagenum": None})
+        try:
+            if response.meta['pagenum'] == 1:
+                nextpage = response.css('div.l_cont td > a:nth-child(1)::attr(href)').get()
+            else:
+                nextpage = response.css('div.l_cont td > a:nth-child(2)::attr(href)').get()
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={"pagenum": None})
+        except Exception as e:
+            self.logger.info(e)
+        # yield SplashRequest(url=url, endpoint='render.html',
+        #                         args={"wait": 5, 'timeout': 90}, callback=self.parse_pages, meta={"pagenum": None})
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '建设工程造价信息网'
+        description = ''
+        content = "".join(re.sub(r'<.*?>', '', ''.join(response.css('#divBody').getall())).split())
+        date = time.time()
+        column = "建设工程造价信息"
+        self.logger.info(url)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 50 - 0
projects/electric/electric/spiders/ceec.py

@@ -0,0 +1,50 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+# 中国能源建设股份有限公司
+# Employee highlights column
+class CeecSpider(scrapy.Spider):
+    name = 'ceec'
+    # download_delay = 20
+    allowed_domains = ['ceec.net.cn']
+    start_urls = ['http://www.ceec.net.cn/col/col11016/index.html']
+
+    def start_requests(self):
+        # for url in self.start_urls:
+        #     self.logger.info('next page: {}'.format(url))
+        #     yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+        yield scrapy.Request(url=self.start_urls[0], callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+        for inx in range(2, 78):
+            url = 'http://www.ceec.net.cn/col/col11016/index.html?uid=410804&pageNum={}'.format(inx)
+            self.logger.info('next page: {}'.format(url))
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_pages(self, response):
+        # for item in response.xpath('//*[@id="410804"]//ul//li'):
+        #     url = item.css('div.lanmu-txt > p > a').attrib['href']
+        #     date = item.css('div.lanmu-time > p > span::text').get()
+        #     self.logger.info('parse item: {}'.format(url))
+        #     yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin, meta={"date": date})
+        # nextpage = response.css('a[title=下一页]').attrib['href']
+        # self.logger.info('next page: {}'.format(nextpage))
+        # yield response.follow(url=nextpage, callback=self.parse_items, errback=self.errback_httpbin)
+        for url in response.xpath('//*[@id="410804"]/script/text()').re(r'href=\"(http://www.ceec.net.cn/art.*?)\"'):
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '中国能源建设股份有限公司'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(response.xpath('//div[@class="wz_article"]//p//text()').getall())
+        date = time.time()
+        column = ''
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 52 - 0
projects/electric/electric/spiders/ceeia.py

@@ -0,0 +1,52 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 中国电器工业协会
+class CeeiaSpider(scrapy.Spider):
+    name = 'ceeia'
+    allowed_domains = ['ceeia.com']
+    start_urls = [
+        ('http://www.ceeia.com/XHDT/{}.html', 49),    # association news (协会动态)
+        ('http://www.ceeia.com/HYZL/{}.html', 1206),  # industry overview (行业纵览)
+        ('http://www.ceeia.com/JSQY/{}.html', 347),   # technology frontier (技术前沿)
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 10,
+    }
+
+    def start_requests(self):
+        for baseurl, maxpage in self.start_urls:
+            url = baseurl.format(1)
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, meta={"baseurl": baseurl, "nextpage": 2, "maxpage": maxpage})
+
+    def parse_pages(self, response):
+        for url in response.css(".u-name a::attr(href)").getall():
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        if response.meta['nextpage'] <= response.meta['maxpage']:
+            nextpage = response.meta['baseurl'].format(response.meta['nextpage'])
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, 
+                callback=self.parse_pages, 
+                errback=self.errback_httpbin, 
+                meta={"baseurl": response.meta['baseurl'], 
+                    'nextpage': response.meta['nextpage'] + 1, 
+                    'maxpage': response.meta['maxpage']
+                })
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.xpath('//div[@class="n-title"]/text()').get()
+        source = '中国电器工业协会'
+        description = ''
+        content = ''.join(''.join(response.xpath('//div[@class="n-content"]//text()').getall()).split())
+        date = time.time()
+        column = ''
+        self.logger.info('title: {}'.format(title))
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 53 - 0
projects/electric/electric/spiders/chinanengyuan.py

@@ -0,0 +1,53 @@
+import re
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 新能源网
+class CnySpider(scrapy.Spider):
+    name = 'cny'
+    allowed_domains = ['china-nengyuan.com']
+    start_urls = [
+        'http://www.china-nengyuan.com/news/news_list.php?keyword=太阳能',
+        'http://www.china-nengyuan.com/news/news_list.php?keyword=风能',
+        'http://www.china-nengyuan.com/news/news_list.php?keyword=氢能',
+        'http://www.china-nengyuan.com/news/news_list.php?keyword=储能',
+        'http://www.china-nengyuan.com/news/news_list.php?keyword=新材料',
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 20,
+        'ITEM_PIPELINES': {
+            'electric.pipelines.PatentMongoDBPipeline': 300,
+        }
+    }
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+
+    def parse_pages(self, response):
+        for url in response.css('.member_tr_row a.blue::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        try:
+            nextpage = response.css('table.membertable_page:nth-of-type(3) a:nth-of-type(1)::attr(href)').get()
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+        except:
+            self.logger.info('Last page')
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css("title::text").get()
+        source = '新能源网'
+        description = ''
+        content =  "".join("".join(response.css(".f16 p::text").getall()).split())
+        date = time.time()
+        column = ''
+        self.logger.info('title: {}'.format(title))
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+            
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))
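The custom_settings above route items to electric.pipelines.PatentMongoDBPipeline, but pipelines.py in this commit only defines ElectricPipeline, ElectricMongoDBPipeline, TempMongoDBPipeline and PushUrlPipeline, and this spider yields ElectricItem rather than PatentItem, so the reference looks like a leftover. If a patent pipeline is intended, a sketch mirroring the existing Mongo pipelines could look like this (the "patent" collection name is an assumption):

    import pymongo
    from itemadapter import ItemAdapter
    from electric.items import PatentItem

    class PatentMongoDBPipeline:
        @classmethod
        def from_crawler(cls, crawler):
            cls.DB_URL = crawler.settings.get('MONGO_DB_URI', 'mongodb://localhost:27017')
            cls.DB_NAME = crawler.settings.get('MONGO_DB_NAME', 'electric')
            return cls()

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.DB_URL)
            self.db = self.client[self.DB_NAME]

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            if isinstance(item, PatentItem):
                self.db["patent"].insert_one(ItemAdapter(item).asdict())  # collection name assumed
            return item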

+ 53 - 0
projects/electric/electric/spiders/chinapower.py

@@ -0,0 +1,53 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# @Author: sprivacy
+# @Date:   2022-05-13 16:54:47
+# @Last Modified by:   sprivacy
+# @Last Modified time: 2022-05-13 18:05:04
+# 中国电力
+
+import re
+import time
+import scrapy
+from electric.items import ElectricItem
+
+class BccnSpider(scrapy.Spider):
+    name = 'chinapower'
+    download_delay = 20
+    allowed_domains = ['chinapower.com.cn']
+    start_urls = [
+                'http://b2b.chinapower.com.cn/news/',
+            ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_class, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_class(self, response):
+        for url in response.css(".head-txt a::attr(href)").getall():
+            yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages(self, response):
+        for item in response.xpath('//div[@class="catlist"]/ul//li[not(@class="sp")]'):
+            url = item.css('a').attrib['href']
+            self.logger.info('parse item: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        nextpage = response.css("div.pages a::attr(href)").getall()[-1]
+        self.logger.info('next page: {}'.format(nextpage))
+        yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('#title::text').get()
+        source = '中国电力'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = "".join(re.sub(r'<.*?>', '', response.css('div.content').get()).split())
+        date = time.time()
+        column = ''
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 45 - 0
projects/electric/electric/spiders/chinapv.py

@@ -0,0 +1,45 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 中国光伏行业协会
+class CpvSpider(scrapy.Spider):
+    name = 'chinapv'
+    allowed_domains = ['chinapv.org.cn']
+    start_urls = ['http://www.chinapv.org.cn/association_news.html']
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 10,
+    }
+    
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url,
+                callback=self.parse_pages,
+                errback=self.errback_httpbin,
+                headers={"Host": 'www.chinapv.org.cn', "Referer": 'http://www.chinapv.org.cn/association_news.html'})
+
+    def parse_pages(self, response):
+        for url in response.css('.text > a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
+        try:
+            nextpage = response.css('a.next::attr(href)').get()
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+        except:
+            self.logger.info('Last page')
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.xpath('//div[@class="title"]/h1/text()').get()
+        source = '中国光伏行业协会'
+        description = ''
+        content = ''.join(''.join(response.xpath('//div[@class="industry_mapxx1"]/div[@class="cont"]/p//text()').getall()).split())
+        date = time.time()
+        column = '光伏'
+        self.logger.info('title: {}'.format(title))
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 57 - 0
projects/electric/electric/spiders/chng.py

@@ -0,0 +1,57 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+# 中国华能集团公司
+# Columns: group news, leadership activities, frontline news, media coverage, SASAC web news
+class ChngSpider(scrapy.Spider):
+    name = 'chng'
+    download_delay = 14
+    allowed_domains = ['chng.com.cn']
+    start_urls = [
+        'https://www.chng.com.cn/list_ldhd/-/article/cWFTITGYzrws/list/23209.html',
+        'https://www.chng.com.cn/list_yxxw/-/article/7tNnqgwRLpoA/list/23219.html',
+        'https://www.chng.com.cn/list_mtbd/-/article/whVXc9vlCPOV/list/23224.html',
+        'https://www.chng.com.cn/list_gzwx/-/article/Ze8xCBHM8In0/list/23229.html',
+
+    ]
+
+    def start_requests(self):
+        for url in ['https://www.chng.com.cn/list_jtyw/-/article/vUfd4jOBhajJ/list/23204.html']:
+            yield scrapy.Request(url=url, callback=self.parse_pages1, errback=self.errback_httpbin, dont_filter=True)
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages2, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages1(self, response):
+        for item in response.css('div.impNews-content div'):
+            urls = item.css('div.atvimg::attr(onclick)').re(r'javascript:window.open\(\'(.*?)\'\)')
+            for url in urls:
+                self.logger.info('parse item: {}'.format(url))
+                yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+
+    def parse_pages2(self, response):
+        for item in response.css('div.leaderShip-content div.news-list-item'):
+            url = item.css('div.news-list-con::attr(onclick)').re(r'javascript:window.open\(\'(.*?)\'\)')[0]
+            self.logger.info('parse item: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        nextpage = response.css('a.layui-laypage-next').attrib['href']
+        self.logger.info('next page: {}'.format(nextpage))
+        if nextpage:
+            yield response.follow(url=nextpage, callback=self.parse_pages2, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '中国华能集团公司'
+        description = ''
+        content = ''.join("".join(response.xpath('//div[@class="detail-article"]//text()').getall()).split())
+        date = time.time()
+        column = ''
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 57 - 0
projects/electric/electric/spiders/cnenergynews.py

@@ -0,0 +1,57 @@
+import time
+import datetime
+import scrapy
+from electric.items import ElectricItem
+
+
+# 中国能源网
+# Columns: local, grid, power, wind, solar PV, energy storage, hydrogen, environment, interviews, technology & equipment, new energy vehicles, energy internet
+class CnenSpider(scrapy.Spider):
+    name = 'cnen'
+    download_delay = 20
+    allowed_domains = ['cnenergynews.cn']
+    baseurls = [
+        'http://cnenergynews.cn/js/88/mi4_sub_articles_{}.js?v=20220221104644',
+        'http://cnenergynews.cn/js/670/mi4_sub_articles_{}.js?v=20220216162004',
+        'http://cnenergynews.cn/js/81/mi4_sub_articles_{}.js?v=20220217094205',
+        'http://cnenergynews.cn/js/94/mi4_sub_articles_{}.js?v=20220215141813',
+        'http://cnenergynews.cn/js/82/mi4_sub_articles_{}.js?v=20220217094755',
+        'http://cnenergynews.cn/js/95/mi4_sub_articles_{}.js?v=20220217094454',
+        'http://cnenergynews.cn/js/96/mi4_sub_articles_{}.js?v=20220216145517',
+        'http://cnenergynews.cn/js/85/mi4_sub_articles_{}.js?v=20220217101542',
+        'http://cnenergynews.cn/js/84/mi4_sub_articles_{}.js?v=20220207111433',
+        'http://cnenergynews.cn/js/450/mi4_sub_articles_{}.js?v=20220212161250',
+        'http://cnenergynews.cn/js/98/mi4_sub_articles_{}.js?v=20220217093835',
+        'http://cnenergynews.cn/js/112/mi4_sub_articles_{}.js?v=20220207111433',
+    ]
+
+    custom_settings = {
+        'CONCURRENT_REQUESTS_PER_IP': 1,
+        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
+        'DOWNLOAD_TIMEOUT': 2,
+    }
+
+    def start_requests(self):
+        for index in range(1, 300):
+            for baseurl in self.baseurls:
+                format_time = (datetime.datetime.now()+datetime.timedelta(days=-index)).strftime("%Y%m%d")
+                url = baseurl.format(format_time)
+                self.logger.info('next url: {}'.format(url))
+                yield scrapy.Request(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+
+    def parse_items(self, response):
+        for item in eval(response.text[65:]):
+            url = item['url']
+            title = item['title']
+            source = '中国能源网'
+            description = item['miSummary']
+            content = item['miContentTxt']
+            date = time.time()
+            column = ''
+            self.logger.info(title)
+            yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))
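parse_items above slices off a fixed 65-character prefix and evaluates the remainder with eval, which both hard-codes the wrapper length and executes remote content. A safer sketch, assuming the .js payload wraps a JSON array that a regex can locate (neither the regex nor the payload format is verified against the live endpoint):

    import json
    import re

    def extract_articles(js_text):
        """Pull the article array out of a mi4_sub_articles_*.js payload (sketch)."""
        match = re.search(r'\[.*\]', js_text, re.S)   # outermost [...] literal
        if not match:
            return []
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            return []

In the spider this would replace eval(response.text[65:]) with extract_articles(response.text).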

+ 62 - 0
projects/electric/electric/spiders/cnnpn.py

@@ -0,0 +1,62 @@
+import time
+import re
+import scrapy
+# from scrapy import Request
+# import scrapy_splash
+# from scrapy_splash import SplashRequest
+
+from electric.items import ElectricItem
+
+# from scrapy.shell import inspect_response
+# from scrapy.utils.response import open_in_browser
+
+# 中国核电网
+class CnnpnSpider(scrapy.Spider):
+    name = 'cnnpn'
+    download_delay = 16
+    allowed_domains = ['cnnpn.cn']
+    start_urls = [
+        'https://www.cnnpn.cn/channel/1.html',
+        'https://www.cnnpn.cn/channel/3.html',
+        'https://www.cnnpn.cn/channel/4.html',
+    ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+            # yield SplashRequest(url=url, endpoint='render.html',
+            #                     args={"wait": 5, 'timeout': 90}, callback=self.parse_pages)
+
+    def parse_pages(self, response):
+        for url in response.css('#news-lists li a::attr(href)').getall():
+            self.logger.info('item url: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+            # yield SplashRequest(url=url, endpoint="render.html",
+            #                     args={"wait": 5, 'timeout': 90}, callback=self.parse_items,
+            #                     meta={"date": date})
+        try:
+            url = response.css('div.pagecode a:nth-last-child(1)::attr(href)').get()
+            self.logger.info('next page: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+        except:
+            self.logger.info('Last page')
+        # yield SplashRequest(url=url, endpoint='render.html',
+        #                     args={"wait": 5, 'timeout': 90}, callback=self.parse_pages)
+
+    def parse_items(self, response):
+        url = response.url
+        source = '中国核电网'
+        description = ''
+        date = time.time()
+        column = '核电'
+        try:
+            title = response.css('title::text').get()
+            content = "".join(re.sub(r"<.*?>", "", response.css("div.content").get()).split())
+            yield ElectricItem(url=url, title=title, source=source,
+                                description=description, content=content,
+                                date=date, column=column)
+        except Exception as e:
+            self.logger.warning(e)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 80 - 0
projects/electric/electric/spiders/cpnn.py

@@ -0,0 +1,80 @@
+import time
+import re
+import scrapy
+# from scrapy_splash import SplashRequest
+
+from electric.items import ElectricItem
+
+
+# 中国电力报
+# Columns: technology, online State Grid, infrastructure, people
+class CpnnSpider(scrapy.Spider):
+    name = 'cpnn'
+    download_delay = 15
+    allowed_domains = ['cpnn.com.cn']
+    start_urls = [
+        'http://www.cpnn.com.cn/news/nytt/',
+        'http://www.cpnn.com.cn/news/nyjg/',
+        'http://www.cpnn.com.cn/news/nyqy/',
+        'http://www.cpnn.com.cn/news/dfny/',
+        'http://www.cpnn.com.cn/news/hg/',
+        'http://www.cpnn.com.cn/news/hy/',
+        'http://www.cpnn.com.cn/news/kj/',
+        'http://www.cpnn.com.cn/news/nygm/',
+        'http://www.cpnn.com.cn/news/tdftzh/',
+        'http://www.cpnn.com.cn/news/xny/',
+        'http://www.cpnn.com.cn/news/xxdlxt/',
+        'http://www.cpnn.com.cn/dianli/dangjian/',
+        'http://www.cpnn.com.cn/shouye/sylm/wsgw/',
+        'http://www.cpnn.com.cn/qiye/shizheng/',
+        'http://www.cpnn.com.cn/qiye/yingcai/',
+        'http://www.cpnn.com.cn/qiye/rongyu/',
+        
+    ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+        for url in ['http://www.cpnn.com.cn/dianli/dljs/dljsyw/',]:
+            yield scrapy.Request(url=url, callback=self.parse_pages1, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages(self, response):
+        for url in response.css("div.cpnnlist div.fl ul li a::attr(href)").getall():
+            yield response.follow(url=url, dont_filter=False, callback=self.parse_items, errback=self.errback_httpbin)
+        nextpage = response.css('div.page a').re(r'href="(.*?)">下一页')[0]
+        self.logger.info('next page: {}'.format(nextpage))
+        yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_pages1(self, response):
+        for url in response.css('a.fl::attr(href)').getall():
+            yield response.follow(url=url, dont_filter=False, callback=self.parse_items1, errback=self.errback_httpbin)
+        nextpage = response.css('div.page a').re(r'href="(.*?)">下一页')[0]
+        self.logger.info('next page: {}'.format(nextpage))
+        yield response.follow(url=nextpage, callback=self.parse_pages1, errback=self.errback_httpbin)
+
+    def parse_items1(self, response):
+        url = response.url
+        title = response.css("h3::text").get()
+        source = '中国电力报'
+        description = ''
+        content = "".join(re.sub(r"<.*?>", "", "".join(response.css(".detail-box p:nth-of-type(n+2)").getall())).split())
+        date = time.time()
+        column = ''
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css("h3::text").get()
+        source = '中国电力报'
+        description = response.css('meta[name=Description]::attr(content)').get()
+        content = ''.join(''.join(response.css("div.cpnnlist_l p::text").getall()).split())
+        date = time.time()
+        column = ''
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 53 - 0
projects/electric/electric/spiders/csg.py

@@ -0,0 +1,53 @@
+from datetime import datetime
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+# 南方电网报
+# Columns: company news, executive updates, frontline reports, CSG in pictures, media coverage
+class CsgSpider(scrapy.Spider):
+    name = 'csg'
+    download_delay = 15
+    allowed_domains = ['csg.cn']
+    start_urls = [
+        'https://www.csg.cn/xwzx/{}/gsyw/',
+        'https://www.csg.cn/xwzx/{}/gcdt/',
+        'https://www.csg.cn/xwzx/{}/yxcz/',
+        'https://www.csg.cn/xwzx/{}/tsnw/',
+        'https://www.csg.cn/xwzx/{}/mtgz/',
+    ]
+
+    def start_requests(self):
+        for base_url in self.start_urls:
+            for year in range(datetime.now().year, 2006, -1):
+                # keep the template URL intact; overwriting it would only format the first year
+                url = base_url.format(year)
+                yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_pages(self, response):
+        for url in response.css('div.list-news a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        pagecount, pagecur = response.css('#page script').re(r'createPageHTMLV2\((\d+), (\d+)')
+        pagecount = eval(pagecount)
+        pagecur = eval(pagecur)
+        if pagecur + 1 < pagecount:
+            if pagecur == 0:
+                nextpage = response.url + 'index_1.html'
+            else:
+                nextpage = response.url[:-12] + 'index_{}.html'.format(pagecur+1)
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '南方电网报'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = "".join("".join(response.css("div.TRS_Editor p::text").getall()).split())
+        date = time.time()
+        column = ''
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 61 - 0
projects/electric/electric/spiders/ctg.py

@@ -0,0 +1,61 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+# 三峡集团
+# Columns: headline news, group news, general news
+class CtgSpider(scrapy.Spider):
+    name = 'ctg'
+    download_delay = 16
+    allowed_domains = ['ctg.com.cn']
+    start_urls = [
+        ('https://www.ctg.com.cn/eportal/ui?pageId=721173&currentPage={}&moduleId=47eb7057d63f4966a926bfa6fedc3648&staticRequest=yes', 791),
+        ('https://www.ctg.com.cn/eportal/ui?pageId=721176&currentPage={}&moduleId=ddb9046bc9ad46aba628ba0bcd74aea0&staticRequest=yes', 2798),
+        ('https://www.ctg.com.cn/eportal/ui?pageId=721179&currentPage={}&moduleId=5a38941454f8489b82222bf8308eaf87&staticRequest=yes', 2029),
+        ]
+
+    def start_requests(self):
+        for url, pagemax in [('https://www.ctg.com.cn/eportal/ui?pageId=721194&currentPage={}&moduleId=19e689e3ca304e74a9a16e2a046bc9d8&staticRequest=yes', 219)]:
+            pagecur = 1
+            starturl = url.format(pagecur)
+            yield scrapy.Request(url=starturl, callback=self.parse_pages1, errback=self.errback_httpbin, meta={"base_url":url, "pagecur": pagecur, "pagemax": pagemax}, dont_filter=True)
+        for url, pagemax in self.start_urls:
+            pagecur = 1
+            starturl = url.format(pagecur)
+            yield scrapy.Request(url=starturl, callback=self.parse_pages, errback=self.errback_httpbin, meta={"base_url":url, "pagecur": pagecur, "pagemax": pagemax}, dont_filter=True)
+
+    def parse_pages1(self, response):
+        for url in response.css('.content_ztbd a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        if response.meta['pagecur'] < response.meta['pagemax']:
+            nextpage = response.meta['base_url'].format(response.meta['pagecur'] + 1)
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages1, errback=self.errback_httpbin, meta={"base_url": response.meta['base_url'], "pagecur":
+                                response.meta['pagecur'] + 1, "pagemax": response.meta['pagemax']})
+
+    def parse_pages(self, response):
+        for item in response.css('div.content_zhxw'):
+            url = item.css('h4 a').attrib['href']
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        if response.meta['pagecur'] < response.meta['pagemax']:
+            nextpage = response.meta['base_url'].format(response.meta['pagecur'] + 1)
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={"base_url": response.meta['base_url'], "pagecur":
+                                response.meta['pagecur'] + 1, "pagemax": response.meta['pagemax']})
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '中国长江三峡集团'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.xpath('//div[@class="content_skdb"]//p//text()').getall()).split())
+        date = time.time()
+        column = ""
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 41 - 0
projects/electric/electric/spiders/cweea.py

@@ -0,0 +1,41 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 风能产业网
+class CweeaSpider(scrapy.Spider):
+    name = 'cweea'
+    allowed_domains = ['cweea.com.cn']
+    start_urls = ['http://www.cweea.com.cn/xwdt/xkdt/']
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 10,
+    }
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_pages(self, response):
+        for url in response.css('a.point::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
+        try:
+            nextpage = response.xpath('//span[@class="JZD_PAGE_NEXT"]/a').attrib['href']
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+        except Exception:
+            self.logger.info('Last page')
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '风能产业网'
+        description = ''
+        content = ''.join(''.join(response.xpath('//div[@class="info"]/p/span/text()').getall()).split())
+        date = time.time()
+        column = '风能'
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 48 - 0
projects/electric/electric/spiders/eptchina.py

@@ -0,0 +1,48 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+# 电力科技网
+# 核电、热电、电网、节能、科技
+class EptcSpider(scrapy.Spider):
+    name = 'eptc'
+    download_delay = 20
+    allowed_domains = ['eptchina.com']
+    start_urls = [
+        'http://www.eptchina.com/news/list-16.html',
+        'http://www.eptchina.com/news/list-17.html',
+        'http://www.eptchina.com/news/list-20.html',
+        'http://www.eptchina.com/news/list-19.html',
+        'http://www.eptchina.com/news/list-7.html',
+    ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_pages(self, response):
+        for item in response.css('div.articlelist ul li'):
+            url = item.css('a').attrib['href']
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        try:
+            nextpage = response.css('div.page a::attr(href)').getall()[-1]
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+        except Exception:
+            self.logger.info('Last page')
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '电力科技网'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.xpath('//*[@id="ctrlfscont"]/div//text()').getall()).split())
+        date = time.time()
+        column = ''
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 45 - 0
projects/electric/electric/spiders/escn.py

@@ -0,0 +1,45 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+# 中国储能网
+class EscnSpider(scrapy.Spider):
+    name = 'escn'
+    download_delay = 15
+    allowed_domains = ['escn.com.cn']
+    start_urls = ['http://www.escn.com.cn/']
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_classes, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_classes(self, response):
+        for url in response.css('.left-auto a.s_more::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages(self, response):
+        for url in response.css('.n-onelist a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        try:
+            nextpage = response.css('div.digg a').re(r'href="(.*?)">下一页')[0]
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+        except Exception:
+            self.logger.info('Last page')
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '中国储能网'
+        description = ''
+        content = "".join("".join(response.css("div.entry p::text").getall()).split())
+        date = time.time()
+        column = '储能'
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 47 - 0
projects/electric/electric/spiders/ewindpower.py

@@ -0,0 +1,47 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+# 国际风能网
+class EwpcSpider(scrapy.Spider):
+    name = 'ewindpower'
+    download_delay = 15
+    allowed_domains = ['ewindpower.cn']
+    start_urls = [
+        'http://www.ewindpower.cn/news/list-htm-catid-15-page-{}.html',
+        'http://www.ewindpower.cn/news/list-htm-catid-14-page-{}.html',
+    ]
+
+    def start_requests(self):
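+        # Pages are fanned out up front (catid 14/15, first page plus pages 2-11) instead of following
+        # the "下一页" link; the commented-out block in parse_pages kept that next-page-following variant.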
+        yield scrapy.Request(url='http://www.ewindpower.cn/news/list-htm-catid-14.html', callback=self.parse_pages, errback=self.errback_httpbin)
+        yield scrapy.Request(url='http://www.ewindpower.cn/news/list-htm-catid-15.html', callback=self.parse_pages, errback=self.errback_httpbin)
+        for baseurl in self.start_urls:
+            for page in range(2, 12):
+                url = baseurl.format(page)
+                self.logger.info('next page: {}'.format(url))
+                yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages(self, response):
+        for item in response.css('div#iframe_11 > span > table li'):
+            url = item.css('a').attrib['href']
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        # nextpage = response.css('div.pages a[title="下一页"]').attrib['href']
+        # self.logger.info('next page: {}'.format(nextpage))
+        # yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '国际风能网'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.xpath('//div[@class="content"]//text()').getall()).split())
+        date = time.time()
+        column = '风能'
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 53 - 0
projects/electric/electric/spiders/gxepa.py

@@ -0,0 +1,53 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 广西电力行业协会
+class GxepaSpider(scrapy.Spider):
+    name = 'gxepa'
+    allowed_domains = ['gxepa.org.cn']
+    start_urls = [
+        ('http://www.gxepa.org.cn/news_rdjj',102),
+        ('http://www.gxepa.org.cn/news_xhyw',31),
+        ('http://www.gxepa.org.cn/news_hyyw', 81),
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 10,
+        'DOWNLOAD_TIMEOUT': 1800,
+    }
+
+    def start_requests(self):
+        for url, maxpage in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin,
+                meta={"url": url, "nextpage": 2, 'maxpage': maxpage},
+                dont_filter=True)
+
+    def parse_pages(self, response):
+        for item in response.xpath('//div[@class="list-news"]/ul/li'):
+            url = item.css('a').attrib['href']
+            self.logger.info('parse item: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
+        if response.meta['nextpage'] <= response.meta['maxpage']:
+            nextpage = response.meta['url'] + '?pageNo={}'.format(response.meta['nextpage'])
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin,
+                meta={'url': response.meta['url'], "nextpage": response.meta['nextpage'] + 1,
+                        'maxpage': response.meta['maxpage']},
+                dont_filter=True)
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '广西电力行业协会'
+        description = ''
+        content = ''.join(''.join(response.xpath('//div[@id="ArtText"]//text()').getall()).split())
+        date = time.time()
+        column = ""
+        self.logger.info('title: {}'.format(title))
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 65 - 0
projects/electric/electric/spiders/iesplaza.py

@@ -0,0 +1,65 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+# import scrapy_splash
+# from scrapy_splash import SplashRequest
+
+# from scrapy.shell import inspect_response
+# from scrapy.utils.response import open_in_browser
+
+
+# 综合能源服务网
+
+class IesplazaSpider(scrapy.Spider):
+    name = 'iesplaza'
+    download_delay = 10
+    allowed_domains = ['iesplaza.com']
+    start_urls = [
+        'https://www.iesplaza.com/news',
+        'https://www.iesplaza.com/focus',
+        'https://www.iesplaza.com/project',
+        'https://www.iesplaza.com/company',
+        'https://www.iesplaza.com/research',
+        'https://www.iesplaza.com/viewpoint',
+        'https://www.iesplaza.com/tech',
+        'https://www.iesplaza.com/market',
+        'https://www.iesplaza.com/case',
+    ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+            # yield SplashRequest(url=url, endpoint='render.html',
+            #                     args={"wait": 5, 'timeout': 90}, callback=self.parse_pages)
+
+    def parse_pages(self, response):
+        for item in response.css('div.list_msg'):
+            itemurl = item.css('a::attr(href)').get()
+            url = response.urljoin(itemurl)
+            self.logger.info('parse item: {}'.format(url))
+            yield scrapy.Request(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+            # yield SplashRequest(url=url, endpoint='render.html',
+            #                     args={"wait": 5, 'timeout': 90}, callback=self.parse_items,
+            #                     meta={"date": date, "column": column})
+
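+        # The last entry in the pagination bar is treated as the link to the next page.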
+        nextpage = response.css('ul.pagination li:nth-last-child(1) > a::attr(href)').get()
+        if nextpage:
+            url = response.urljoin(nextpage)
+            self.logger.info('next page: {}'.format(url))
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+        # yield SplashRequest(url=url, endpoint='render.html',
+        #                         args={"wait": 5, 'timeout': 90}, callback=self.parse_pages)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = "综合能源服务网"
+        description = ''
+        content = "".join(response.css("#ct > div.text_msg p::text").getall())
+        date = time.time()
+        column = '智慧能源'
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 50 - 0
projects/electric/electric/spiders/nengyuanjie.py

@@ -0,0 +1,50 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 能源界
+class NyjieSpider(scrapy.Spider):
+    name = 'nengyuanjie'
+    allowed_domains = ['nengyuanjie.net']
+    start_urls = [
+        ('http://www.nengyuanjie.net/series/chuneng.html',188,'储能'),
+        ('http://www.nengyuanjie.net/series/hedian.html',180,'核电'),
+        ('http://www.nengyuanjie.net/series/fengdian.html',188,'风电'),
+        ('http://www.nengyuanjie.net/series/guangfu.html',257,'光伏'),
+        ('http://www.nengyuanjie.net/series/nengyuanhulianwang.html',43,'计算机'),
+        ('http://www.nengyuanjie.net/series/qingneng.html',90,'氢能'),
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 10,
+        'DOWNLOAD_TIMEOUT': 1800,
+    }
+
+    def start_requests(self):
+        for url, maxpage, column in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, meta={"url": url, "nextpage": 2, 'maxpage': maxpage, 'column': column}, dont_filter=True)
+
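+    # Page counters travel in request meta: "nextpage" is the next page number to request and
+    # "maxpage" the last page observed on the site when the spider was written.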
+    def parse_pages(self, response):
+        for item in response.xpath('//div[@class="lists"]/div[@class="li"]'):
+            url = item.css('a').attrib['href']
+            self.logger.info('parse item: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin, meta={'column': response.meta['column']})
+        if response.meta['nextpage'] <= response.meta['maxpage']:
+            nextpage = response.meta['url'] + '?page={}'.format(response.meta['nextpage'])
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={'url': response.meta['url'], "nextpage": response.meta['nextpage'] + 1, 'maxpage': response.meta['maxpage'], 'column': response.meta['column']}, dont_filter=True)
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '能源界'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.xpath('//div[@class="content"]//text()').getall()).split())
+        date = time.time()
+        column = response.meta['column']
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 63 - 0
projects/electric/electric/spiders/newenergy.py

@@ -0,0 +1,63 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 新能源网
+class NgccSpider(scrapy.Spider):
+    name = 'newenergy'
+    allowed_domains = ['newenergy.giec.cas.cn']
+    start_urls = [
+        ('http://newenergy.giec.cas.cn/tyn/xydt/index{}.html', 31, '光伏'),
+        ('http://newenergy.giec.cas.cn/tyn/jrjj/index{}.html', 31, '光伏'),
+        ('http://newenergy.giec.cas.cn/tyn/gfdt/index{}.html', 31, '光伏'),
+        ('http://newenergy.giec.cas.cn/tyn/grdt/index{}.html', 15, '光伏'),
+        ('http://newenergy.giec.cas.cn/tyn/jcyy/index{}.html', 31, '光伏'),
+        ('http://newenergy.giec.cas.cn/tyn/cpyjs/index{}.html', 31, '光伏'),
+        ('http://newenergy.giec.cas.cn/fn/jrjj_15706/index{}.html', 31, '风能'),
+        ('http://newenergy.giec.cas.cn/fn/fndt/index{}.html', 31, '风能'),
+        ('http://newenergy.giec.cas.cn/fn/cydt/index{}.html', 31, '风能'),
+        ('http://newenergy.giec.cas.cn/fn/fnzy/index{}.html', 2, '风能'),
+        ('http://newenergy.giec.cas.cn/fn/cpyjs_15710/index{}.html', 23, '风能'),
+        ('http://newenergy.giec.cas.cn/xsdt/index{}.html', 39, ''),
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 6,
+        'DOWNLOAD_TIMEOUT': 1800,
+    }
+
+    def start_requests(self):
+        for baseurl, maxpage, tag in self.start_urls:
+            url = baseurl.format("")
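+            # The first page is index.html and later pages are index_<n>.html, so the meta keeps a
+            # template with '_{}' substituted into the placeholder.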
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True, meta={"url": baseurl.format('_{}'), "maxpage": maxpage, "nextpage": 1, "tag":tag})
+
+    def parse_pages(self, response):
+        # Some columns list articles in <ul class="list_article">, older ones in a plain table;
+        # fall back to the table layout when the first selector finds nothing.
+        items = response.xpath('//ul[@class="list_article"]/li')
+        if not items:
+            items = response.xpath('//div/table[not(@class)]//tr')
+        for item in items:
+            url = item.css('a::attr(href)').get()
+            if url:
+                yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin, meta={"url": url, "tag": response.meta['tag']})
+        if response.meta['nextpage'] <= response.meta['maxpage']:
+            nextpage = response.meta['url'].format(response.meta['nextpage'])
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={'url': response.meta['url'], "nextpage": response.meta['nextpage'] + 1, 'maxpage': response.meta['maxpage'], "tag": response.meta['tag']})
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '新能源网'
+        description = ''
+        content = ''.join(''.join(response.xpath('//div[@class="TRS_Editor"]//text()').getall()).split())
+        date = time.time()
+        column = response.meta['tag']
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 42 - 0
projects/electric/electric/spiders/piec.py

@@ -0,0 +1,42 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 国际电力网
+class PiecSpider(scrapy.Spider):
+    name = 'piec'
+    allowed_domains = ['power.in-en.com']
+    start_urls = ['https://power.in-en.com/visit/news/']
+    
+    custom_settings = {
+        'DOWNLOAD_DELAY': 6,
+        'DOWNLOAD_TIMEOUT': 1800,
+    }
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True, meta={"url": url, "maxpage": 262, "nextpage": 2})
+
+    def parse_pages(self, response):
+        for item in response.xpath('//ul[@class="infoList"]/li/div[@class="listTxt"]'):
+            url = item.css('h5 > a').attrib['href']
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin, meta={"url": url})
+        if response.meta['nextpage'] <= response.meta['maxpage']:
+            nextpage = response.meta['url'] + 'list801-{}.html'.format(response.meta['nextpage'])
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={'url': response.meta['url'], "nextpage": response.meta['nextpage'] + 1, 'maxpage': response.meta['maxpage']})
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '国际电力网'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.xpath('//div[@id="article"]//text()').getall()).split())
+        date = time.time()
+        column = ''
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 56 - 0
projects/electric/electric/spiders/powerchina.py

@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+import time
+
+import scrapy
+from electric.items import ElectricItem
+
+
+# 中国电建
+# 公司要闻、基层动态、国际项目、重点报道、国资动态、行业信息、媒体聚焦、科技动态
+class PcnSpider(scrapy.Spider):
+    name = 'powerchina'
+    download_delay = 15
+    allowed_domains = ['powerchina.cn']
+    start_urls = [
+        ('https://www.powerchina.cn/col/col7440/index.html',776),
+        ('https://www.powerchina.cn/col/col7442/index.html',4225),
+        ('https://www.powerchina.cn/col/col7449/index.html',1047),
+        ('https://www.powerchina.cn/col/col7450/index.html',1322),
+        ('https://www.powerchina.cn/col/col7457/index.html',231),
+        ('https://www.powerchina.cn/col/col7459/index.html',1110),
+        ('https://www.powerchina.cn/col/col7460/index.html',282),
+        ('https://www.powerchina.cn/col/col7461/index.html',279),
+    ]
+
+    def start_requests(self):
+        for url, maxpage in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse, errback=self.errback_httpbin, dont_filter=True, meta={"url":url, "maxpage":maxpage, "curpage": 1})
+
+    def parse(self, response):
+        # The landing page is itself the first listing page, so parse it before fanning out the rest.
+        yield from self.parse_pages(response)
+        for curpage in range(1, response.meta['maxpage']):
+            nextpage = response.meta['url'] + r'?uid=46098&pageNum={}'.format(curpage + 1)
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages(self, response):
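+        # The column pages render their article list from an inline script, so links are pulled
+        # out of the <script> under div#46098 with a regex rather than a CSS selector.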
+        for url in response.xpath('//div[@id="46098"]/script').re(r'href=\'(.*?)\''):
+            url = response.urljoin(url)
+            self.logger.info('parse items: {}'.format(url))
+            yield scrapy.Request(url=url, callback=self.parse_items, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '中国电建'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.css('div#zoom p::text').getall()).split())
+        date = time.time()
+        column = ''
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))
+

+ 45 - 0
projects/electric/electric/spiders/solarbe.py

@@ -0,0 +1,45 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+# 索比光伏网
+# 光伏技术
+class SolarbeSpider(scrapy.Spider):
+    name = 'solarbe'
+    download_delay = 10
+    allowed_domains = ['solarbe.com']
+    start_urls = [
+        'https://news.solarbe.com/qiye/',
+        'https://news.solarbe.com/xiangmu/',
+        'https://news.solarbe.com/technology/'
+    ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages(self, response):
+        for item in response.css('div.listleft div.newslistItem'):
+            url = item.css('a.newsTitle').attrib['href']
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin)
+        try:
+            nextpage = response.css('div.pages a').re(r'href="(.*?)">')[-1]
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+        except Exception:
+            self.logger.info('Last page')
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = "索比光伏网"
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.css('div.mainbody-body-con p::text').getall()).split())
+        date = time.time()
+        column = '光伏'
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 47 - 0
projects/electric/electric/spiders/solarenpv.py

@@ -0,0 +1,47 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+# 光伏产业网
+class SolapvSpider(scrapy.Spider):
+    name = 'solarenpv'
+    allowed_domains = ['solarenpv.com']
+    start_urls = [
+        'http://www.solarenpv.com/',
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 10,
+    }
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_classes, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_classes(self, response):
+        for url in response.css('.m_r .ibox_head a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages(self, response):
+        for url in response.css(".catlist_li a::attr(href)").getall():
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
+        try:
+            url = response.css('.pages a:nth-of-type(10)::attr(href)').get()
+            yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+        except Exception:
+            self.logger.info('Last page')
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '光伏产业网'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.xpath('//div[@id="article"]//text()').getall()).split())
+        date = time.time()
+        column = '光伏'
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 55 - 0
projects/electric/electric/spiders/sungrow.py

@@ -0,0 +1,55 @@
+import scrapy
+from electric.items import ElectricItem
+
+# 阳光电源股份有限公司
+# All columns
+# Done
+class SungrowSpider(scrapy.Spider):
+    name = 'sungrow'
+    download_delay = 20
+    allowed_domains = ['yangg.solarbe.com']
+    start_urls = [
+        'https://yangg.solarbe.com/news/',
+        'https://yangg.solarbe.com/news/page-2.shtml',
+        'https://yangg.solarbe.com/news/page-3.shtml',
+        'https://yangg.solarbe.com/news/page-4.shtml',
+        'https://yangg.solarbe.com/news/page-5.shtml',
+        'https://yangg.solarbe.com/news/page-6.shtml',
+        'https://yangg.solarbe.com/news/page-7.shtml',
+        'https://yangg.solarbe.com/news/page-8.shtml',
+        'https://yangg.solarbe.com/news/page-9.shtml',
+        'https://yangg.solarbe.com/news/page-10.shtml',
+        'https://yangg.solarbe.com/news/page-11.shtml',
+        'https://yangg.solarbe.com/news/page-12.shtml',
+        'https://yangg.solarbe.com/news/page-13.shtml',
+    ]
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_pages(self, response):
+        for item in response.xpath('//div[@class="main_body"]//tr'):
+            url = item.css('a').attrib['href']
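+            # The publication date only appears on the listing page, so read it here and pass it down via meta.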
+            date = item.css('td.f_gray::text').get()
+            self.logger.info('parse item: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_items, errback=self.errback_httpbin, meta={"date": date})
+        # nextpage = response.css("div.pages a").getall()[-1]
+        # self.logger.info('next page: {}'.format(nextpage))
+        # yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin)
+
+    def parse_items(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '阳光电源股份有限公司'
+        description = response.css('meta[name=description]::attr(content)').get()
+        content = ''.join(''.join(response.xpath('//*[@id="content"]//p//text()').getall()).split())
+        date = response.meta['date']
+        column = response.css('meta[name=keywords]::attr(content)').get()
+        self.logger.info(title)
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 46 - 0
projects/electric/electric/spiders/twea.py

@@ -0,0 +1,46 @@
+import time
+
+import scrapy
+from electric.items import ElectricItem
+
+# 天津市新能源协会
+class TweaSpider(scrapy.Spider):
+    name = 'twea'
+    allowed_domains = ['twea.org.cn']
+    start_urls = [
+        ('http://www.twea.org.cn/?/article/', 39),
+        ('http://www.twea.org.cn/?/hyzixun/', 102),
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 10,
+    }
+
+    def start_requests(self):
+        for url, maxpage in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, meta={"url": url, "nextpage": 2, 'maxpage': maxpage}, dont_filter=True)
+
+    def parse_pages(self, response):
+        for item in response.xpath('//ul[@class="main"]/li'):
+            url = item.css('a').attrib['href']
+            self.logger.info('parse item: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
+        if response.meta['nextpage'] <= response.meta['maxpage']:
+            nextpage = response.meta['url'] + 'page-{}/index.html'.format(response.meta['nextpage'])
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={'url': response.meta['url'], "nextpage": response.meta['nextpage'] + 1, 'maxpage': response.meta['maxpage']})
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '天津市新能源协会'
+        description = ""
+        content = ''.join(''.join(response.xpath('//div[@class="main"]/p/text()').getall()).split())
+        date = time.time()
+        column = ''
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 51 - 0
projects/electric/electric/spiders/xhhydropower.py

@@ -0,0 +1,51 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+
+class XhslSpider(scrapy.Spider):
+    name = 'xhhydropower'
+    allowed_domains = ['xhhydropower.com']
+    start_urls = [
+        'https://www.xhhydropower.com/xhslfd/xwzx33/xhyw/index.html',
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 15,
+        'DOWNLOAD_TIMEOUT': 1800,
+        'CLOSESPIDER_TIMEOUT': 0,
+        'CLOSESPIDER_ERRORCOUNT': 0,
+    }
+
+    def start_requests(self):
+        for url in self.start_urls:
+            yield scrapy.Request(url=url, callback=self.parse_classes, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_classes(self, response):
+        for url in response.css('.li_3 a.leftsubchecked01::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin, dont_filter=True)
+
+    def parse_pages(self, response):
+        for url in response.css('.ttone-title a::attr(href)').getall():
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
+        try:
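+            # The "下一页" link keeps its target in a tagname attribute, hence the regex instead of ::attr(href).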
+            url = response.css('a[title="下一页"]').re('tagname="(.*?)"')[0]
+            yield response.follow(url=url, callback=self.parse_pages, errback=self.errback_httpbin)
+        except Exception:
+            self.logger.info('Last page')
+
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('title::text').get()
+        source = '新华水力发电'
+        description = ''
+        content = ''.join(''.join(response.xpath('//div[@class="content-box"]//text()').getall()).split())
+        date = time.time()
+        column = '水电'
+        self.logger.info('title: {}'.format(title))
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 45 - 0
projects/electric/electric/spiders/zzsolar.py

@@ -0,0 +1,45 @@
+import time
+import scrapy
+from electric.items import ElectricItem
+
+## 郑州国际太阳能光伏展览会
+class ZzsolarSpider(scrapy.Spider):
+    name = 'zzsolar'
+    allowed_domains = ['zzsolar.com.cn']
+    start_urls = [
+        ('https://zzsolar.com.cn/a/news/hyxw/list_13_{}.html', 122),
+    ]
+
+    custom_settings = {
+        'DOWNLOAD_DELAY': 10,
+    }
+
+    def start_requests(self):
+        for baseurl, maxpage in self.start_urls:
+            url = baseurl.format(1)
+            yield scrapy.Request(url=url, callback=self.parse_pages, errback=self.errback_httpbin, meta={"url": baseurl, "nextpage": 2, 'maxpage': maxpage}, dont_filter=True)
+
+    def parse_pages(self, response):
+        for item in response.xpath('//div[@class="cont thumblist1"]/ul/li'):
+            url = item.css('a.fl').attrib['href']
+            self.logger.info('parse item: {}'.format(url))
+            yield response.follow(url=url, callback=self.parse_item, errback=self.errback_httpbin)
+        if response.meta['nextpage'] <= response.meta['maxpage']:
+            nextpage = response.meta['url'].format(response.meta['nextpage'])
+            self.logger.info('next page: {}'.format(nextpage))
+            yield response.follow(url=nextpage, callback=self.parse_pages, errback=self.errback_httpbin, meta={'url': response.meta['url'], "nextpage": response.meta['nextpage'] + 1, 'maxpage': response.meta['maxpage']})
+    
+    def parse_item(self, response):
+        url = response.url
+        title = response.css('h1::text').get()
+        source = '郑州国际太阳能光伏展览会'
+        description = ''
+        content = ''.join(''.join(response.xpath('//div[@class="showContxt"]//text()').getall()).split())
+        date = time.time()
+        column = '光伏'
+        yield ElectricItem(url=url, title=title, source=source,
+                            description=description, content=content,
+                            date=date, column=column)
+
+    def errback_httpbin(self, failure):
+        self.logger.error(repr(failure))

+ 11 - 0
projects/electric/scrapy.cfg

@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = electric.settings
+
+[deploy:electric]
+url = http://localhost:6800/
+project = electric
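
With this [deploy:electric] target defined, one typical workflow (assuming scrapyd-client is installed and Scrapyd is listening on localhost:6800) is to deploy from the projects/electric directory with:

    scrapyd-deploy electric -p electric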

+ 10 - 0
projects/electric/setup.py

@@ -0,0 +1,10 @@
+# Automatically created by: scrapydweb x scrapyd-client
+
+from setuptools import setup, find_packages
+
+setup(
+    name         = 'project',
+    version      = '1.0',
+    packages     = find_packages(),
+    entry_points = {'scrapy': ['settings = electric.settings']},
+)

+ 358 - 0
scrapydweb_settings_v10.py

@@ -0,0 +1,358 @@
+# coding: utf-8
+"""
+How ScrapydWeb works:
+BROWSER <<<>>> SCRAPYDWEB_BIND:SCRAPYDWEB_PORT <<<>>> your SCRAPYD_SERVERS
+
+GitHub: https://github.com/my8100/scrapydweb
+DOCS: https://github.com/my8100/files/blob/master/scrapydweb/README.md
+文档:https://github.com/my8100/files/blob/master/scrapydweb/README_CN.md
+"""
+import os
+
+
+############################## QUICK SETUP start ##############################
+############################## 快速设置 开始 ###################################
+# Setting SCRAPYDWEB_BIND to '0.0.0.0' or IP-OF-THE-CURRENT-HOST would make
+# the ScrapydWeb server visible externally; otherwise, set it to '127.0.0.1'.
+# The default is '0.0.0.0'.
+SCRAPYDWEB_BIND = '0.0.0.0'
+# Accept connections on the specified port, the default is 5000.
+SCRAPYDWEB_PORT = 5000
+
+# The default is False, set it to True to enable basic auth for the web UI.
+ENABLE_AUTH = False
+# In order to enable basic auth, both USERNAME and PASSWORD should be non-empty strings.
+USERNAME = ''
+PASSWORD = ''
+
+
+# Make sure that [Scrapyd](https://github.com/scrapy/scrapyd) has been installed
+# and started on all of your hosts.
+# Note that for remote access, you have to manually set 'bind_address = 0.0.0.0'
+# in the configuration file of Scrapyd and restart Scrapyd to make it visible externally.
+# Check out 'https://scrapyd.readthedocs.io/en/latest/config.html#example-configuration-file' for more info.
+# ------------------------------ Chinese --------------------------------------
+# 请先确保所有主机都已经安装和启动 [Scrapyd](https://github.com/scrapy/scrapyd)。
+# 如需远程访问 Scrapyd,则需在 Scrapyd 配置文件中设置 'bind_address = 0.0.0.0',然后重启 Scrapyd。
+# 详见 https://scrapyd.readthedocs.io/en/latest/config.html#example-configuration-file
+
+# - the string format: username:password@ip:port#group
+#   - The default port would be 6800 if not provided,
+#   - Both basic auth and group are optional.
+#   - e.g. '127.0.0.1:6800' or 'username:password@localhost:6801#group'
+# - the tuple format: (username, password, ip, port, group)
+#   - When the username, password, or group is too complicated (e.g. contains ':@#'),
+#   - or if ScrapydWeb fails to parse the string format passed in,
+#   - it's recommended to pass in a tuple of 5 elements.
+#   - e.g. ('', '', '127.0.0.1', '6800', '') or ('username', 'password', 'localhost', '6801', 'group')
+SCRAPYD_SERVERS = [
+    '127.0.0.1:6800',
+    # 'username:password@localhost:6801#group',
+    # ('username', 'password', 'localhost', '6801', 'group'),
+]
+
+
+# It's recommended to update the three options below
+# if both ScrapydWeb and one of your Scrapyd servers run on the same machine.
+# ------------------------------ Chinese --------------------------------------
+# 假如 ScrapydWeb 和某个 Scrapyd 运行于同一台主机,建议更新如下三个设置项。
+
+# If both ScrapydWeb and one of your Scrapyd servers run on the same machine,
+# ScrapydWeb would try to directly read Scrapy logfiles from disk, instead of making a request
+# to the Scrapyd server.
+# e.g. '127.0.0.1:6800' or 'localhost:6801', do not forget the port number.
+LOCAL_SCRAPYD_SERVER = '127.0.0.1:6800'
+
+# Enter the directory when you run Scrapyd, run the command below
+# to find out where the Scrapy logs are stored:
+# python -c "from os.path import abspath, isdir; from scrapyd.config import Config; path = abspath(Config().get('logs_dir')); print(path); print(isdir(path))"
+# Check out https://scrapyd.readthedocs.io/en/stable/config.html#logs-dir for more info.
+# e.g. 'C:/Users/username/logs' or '/home/username/logs'
+LOCAL_SCRAPYD_LOGS_DIR = './logs'
+
+# The default is False, set it to True to automatically run LogParser as a subprocess at startup.
+# Note that you can run the LogParser service separately via command 'logparser' as you like.
+# Run 'logparser -h' to find out the config file of LogParser for more advanced settings.
+# Visit https://github.com/my8100/logparser for more info.
+ENABLE_LOGPARSER = True
+############################## QUICK SETUP end ################################
+############################## 快速设置 结束 ###################################
+
+
+############################## ScrapydWeb #####################################
+# The default is False, set it to True and add both CERTIFICATE_FILEPATH and PRIVATEKEY_FILEPATH
+# to run ScrapydWeb in HTTPS mode.
+# Note that this feature is not fully tested, please leave your comment here if ScrapydWeb
+# raises any exception at startup: https://github.com/my8100/scrapydweb/issues/18
+ENABLE_HTTPS = False
+# e.g. '/home/username/cert.pem'
+CERTIFICATE_FILEPATH = ''
+# e.g. '/home/username/cert.key'
+PRIVATEKEY_FILEPATH = ''
+
+
+############################## Scrapy #########################################
+# ScrapydWeb is able to locate projects in the SCRAPY_PROJECTS_DIR,
+# so that you can simply select a project to deploy, instead of packaging it in advance.
+# e.g. 'C:/Users/username/myprojects' or '/home/username/myprojects'
+SCRAPY_PROJECTS_DIR = './projects'
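+# In this repo that is the ./projects directory, which holds the electric project added above.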
+
+
+############################## Scrapyd ########################################
+# ScrapydWeb would try every extension in sequence to locate the Scrapy logfile.
+# The default is ['.log', '.log.gz', '.txt'].
+SCRAPYD_LOG_EXTENSIONS = ['.log', '.log.gz', '.txt']
+
+
+############################## LogParser ######################################
+# Whether to backup the stats json files locally after you visit the Stats page of a job
+# so that it is still accessible even if the original logfile has been deleted.
+# The default is True, set it to False to disable this behaviour.
+BACKUP_STATS_JSON_FILE = True
+
+
+############################## Timer Tasks ####################################
+# Run ScrapydWeb with argument '-sw' or '--switch_scheduler_state', or click the ENABLED|DISABLED button
+# on the Timer Tasks page to turn on/off the scheduler for the timer tasks and the snapshot mechanism below.
+
+# The default is 300, which means ScrapydWeb would automatically create a snapshot of the Jobs page
+# and save the jobs info in the database in the background every 300 seconds.
+# Note that this behaviour would be paused if the scheduler for timer tasks is disabled.
+# Set it to 0 to disable this behaviour.
+JOBS_SNAPSHOT_INTERVAL = 300
+
+
+############################## Run Spider #####################################
+# The default is False, set it to True to automatically
+# expand the 'settings & arguments' section in the Run Spider page.
+SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = False
+
+# The default is 'Mozilla/5.0', set it to a non-empty string to customize the default value of `custom`
+# in the drop-down list of `USER_AGENT`.
+SCHEDULE_CUSTOM_USER_AGENT = 'Mozilla/5.0'
+
+# The default is None, set it to any value of ['custom', 'Chrome', 'iPhone', 'iPad', 'Android']
+# to customize the default value of `USER_AGENT`.
+SCHEDULE_USER_AGENT = None
+
+# The default is None, set it to True or False to customize the default value of `ROBOTSTXT_OBEY`.
+SCHEDULE_ROBOTSTXT_OBEY = None
+
+# The default is None, set it to True or False to customize the default value of `COOKIES_ENABLED`.
+SCHEDULE_COOKIES_ENABLED = None
+
+# The default is None, set it to a non-negative integer to customize the default value of `CONCURRENT_REQUESTS`.
+SCHEDULE_CONCURRENT_REQUESTS = None
+
+# The default is None, set it to a non-negative number to customize the default value of `DOWNLOAD_DELAY`.
+SCHEDULE_DOWNLOAD_DELAY = 5
+
+# The default is "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1",
+# set it to '' or any non-empty string to customize the default value of `additional`.
+# Use '\r\n' as the line separator.
+SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1"
+
+
+############################## Page Display ###################################
+# The default is True, set it to False to hide the Items page, as well as
+# the Items column in the Jobs page.
+SHOW_SCRAPYD_ITEMS = True
+
+# The default is True, set it to False to hide the Job column in the Jobs page with non-database view.
+SHOW_JOBS_JOB_COLUMN = True
+
+# The default is 0, which means unlimited, set it to a positive integer so that
+# only the latest N finished jobs would be shown in the Jobs page with non-database view.
+JOBS_FINISHED_JOBS_LIMIT = 0
+
+# If your browser stays on the Jobs page, it would be reloaded automatically every N seconds.
+# The default is 300, set it to 0 to disable auto-reloading.
+JOBS_RELOAD_INTERVAL = 300
+
+# The load status of the current Scrapyd server is checked every N seconds,
+# which is displayed in the top right corner of the page.
+# The default is 10, set it to 0 to disable auto-refreshing.
+DAEMONSTATUS_REFRESH_INTERVAL = 10
+
+
+############################## Send Text ######################################
+########## usage in scrapy projects ##########
+# See the "Send Text" page
+
+########## slack ##########
+# How to create a slack app:
+# 1. Visit https://api.slack.com/apps and press the "Create New App" button.
+# 2. Enter your App Name (e.g. myapp) and select one of your Slack Workspaces, then press "Create App".
+# 3. Click the "OAuth & Permissions" menu in the sidebar on the left side of the page.
+# 4. Scroll down the page and find out "Select Permission Scopes" in the "Scopes" section
+# 5. Enter "send" and select "Send messages as <your-app-name>", then press "Save Changes"
+# 6. Scroll up the page and press "Install App to Workspace", then press "Install"
+# 7. Copy the "OAuth Access Token", e.g. xoxp-123-456-789-abcde
+# See https://api.slack.com/apps for more info
+
+# See step 1~7 above, e.g. 'xoxp-123-456-789-abcde'
+SLACK_TOKEN = os.environ.get('SLACK_TOKEN', '')
+# The default channel to use when sending text via slack, e.g. 'general'
+SLACK_CHANNEL = 'general'
+
+########## telegram ##########
+# How to create a telegram bot:
+# 1. Visit https://telegram.me/botfather to start a conversation with Telegram's bot that creates other bots.
+# 2. Send the /newbot command to create a new bot in a chat with BotFather.
+# 3. Follow the instructions to set up name and username (e.g. my_bot) for your bot.
+# 4. You would get a token (e.g. 123:abcde) after step 3.
+# 5. Visit telegram.me/<bot_username> (e.g. telegram.me/my_bot) and say hi to your bot to initiate a conversation.
+# 6. Visit https://api.telegram.org/bot<token-in-step-4>/getUpdates to get the chat_id.
+#    (e.g. Visit https://api.telegram.org/bot123:abcde/getUpdates
+#     and you can find the chat_id in "chat":{"id":123456789,...)
+# See https://core.telegram.org/bots#6-botfather for more info
+
+# See step 1~4 above, e.g. '123:abcde'
+TELEGRAM_TOKEN = os.environ.get('TELEGRAM_TOKEN', '')
+# See step 5~6 above, e.g. 123456789
+TELEGRAM_CHAT_ID = int(os.environ.get('TELEGRAM_CHAT_ID', 0))
+
+########## email ##########
+# The default subject to use when sending text via email.
+EMAIL_SUBJECT = 'Email from #scrapydweb'
+
+########## email sender & recipients ##########
+# Leave this option as '' to default to the EMAIL_SENDER option below; Otherwise, set it up
+# if your email service provider requires an username which is different from the EMAIL_SENDER option below to login.
+# e.g. 'username'
+EMAIL_USERNAME = ''
+# As for different email service provider, you might have to get an APP password (like Gmail)
+# or an authorization code (like QQ mail) and set it as the EMAIL_PASSWORD.
+# Check out links below to get more help:
+# https://stackoverflow.com/a/27515833/10517783 How to send an email with Gmail as the provider using Python?
+# https://stackoverflow.com/a/26053352/10517783 Python smtplib proxy support
+# e.g. 'password4gmail'
+EMAIL_PASSWORD = os.environ.get('EMAIL_PASSWORD', '')
+
+# e.g. 'username@gmail.com'
+EMAIL_SENDER = ''
+# e.g. ['username@gmail.com', ]
+EMAIL_RECIPIENTS = [EMAIL_SENDER]
+
+########## email smtp settings ##########
+# Check out this link if you are using ECS of Alibaba Cloud and your SMTP server provides TCP port 25 only:
+# https://www.alibabacloud.com/help/doc-detail/56130.htm
+# Config for https://mail.google.com using SSL: ('smtp.gmail.com', 465, True)
+# Config for https://mail.google.com:           ('smtp.gmail.com', 587, False)
+# Config for https://mail.qq.com using SSL:     ('smtp.qq.com', 465, True)
+# Config for http://mail.10086.cn:              ('smtp.139.com', 25, False)
+SMTP_SERVER = ''
+SMTP_PORT = 0
+SMTP_OVER_SSL = False
+# The timeout in seconds for the connection attempt, the default is 30.
+SMTP_CONNECTION_TIMEOUT = 30
+
+
+############################## Monitor & Alert ################################
+# The default is False, set it to True to launch the poll subprocess to monitor your crawling jobs.
+ENABLE_MONITOR = False
+
+########## poll interval ##########
+# Tip: In order to be notified (and stop or forcestop a job when triggered) in time,
+# you can reduce the value of POLL_ROUND_INTERVAL and POLL_REQUEST_INTERVAL,
+# at the cost of burdening both CPU and bandwidth of your servers.
+
+# Sleep N seconds before starting next round of poll, the default is 300.
+POLL_ROUND_INTERVAL = 300
+# Sleep N seconds between each request to the Scrapyd server while polling, the default is 10.
+POLL_REQUEST_INTERVAL = 10
+
+########## alert switcher ##########
+# Tip: Set the SCRAPYDWEB_BIND option in the "QUICK SETUP" section to the actual IP of your host,
+# then you can visit ScrapydWeb via the links attached in the alert.
+
+# The default is False, set it to True to enable alert via Slack, Telegram, or Email.
+# You have to set up your accounts in the "Send text" section above first.
+ENABLE_SLACK_ALERT = False
+ENABLE_TELEGRAM_ALERT = False
+ENABLE_EMAIL_ALERT = False
+
+########## alert working time ##########
+# Monday is 1 and Sunday is 7.
+# e.g. [1, 2, 3, 4, 5, 6, 7]
+ALERT_WORKING_DAYS = []
+
+# From 0 to 23.
+# e.g. [9] + list(range(15, 18)) >>> [9, 15, 16, 17], or range(24) for 24 hours
+ALERT_WORKING_HOURS = []
+
+########## basic triggers ##########
+# Trigger alert every N seconds for each running job.
+# The default is 0, set it to a positive integer to enable this trigger.
+ON_JOB_RUNNING_INTERVAL = 0
+
+# Trigger alert when a job is finished.
+# The default is False, set it to True to enable this trigger.
+ON_JOB_FINISHED = False
+
+########## advanced triggers ##########
+# - LOG_XXX_THRESHOLD:
+#   - Trigger alert the first time reaching the threshold for a specific kind of log.
+#   - The default is 0, set it to a positive integer to enable this trigger.
+# - LOG_XXX_TRIGGER_STOP (optional):
+#   - The default is False, set it to True to stop current job automatically when reaching the LOG_XXX_THRESHOLD.
+#   - The SIGTERM signal would be sent only one time to shut down the crawler gracefully.
+#   - In order to avoid an UNCLEAN shutdown, the 'STOP' action would be executed one time at most
+#   - if none of the 'FORCESTOP' triggers is enabled, no matter how many 'STOP' triggers are enabled.
+# - LOG_XXX_TRIGGER_FORCESTOP (optional):
+#   - The default is False, set it to True to FORCESTOP current job automatically when reaching the LOG_XXX_THRESHOLD.
+#   - The SIGTERM signal would be sent twice resulting in an UNCLEAN shutdown, without the Scrapy stats dumped!
+#   - The 'FORCESTOP' action would be executed if both of the 'STOP' and 'FORCESTOP' triggers are enabled.
+
+# Note that the 'STOP' action and the 'FORCESTOP' action would still be executed even when the current time
+# is NOT within the ALERT_WORKING_DAYS and the ALERT_WORKING_HOURS, though no alert would be sent.
+
+LOG_CRITICAL_THRESHOLD = 0
+LOG_CRITICAL_TRIGGER_STOP = False
+LOG_CRITICAL_TRIGGER_FORCESTOP = False
+
+LOG_ERROR_THRESHOLD = 0
+LOG_ERROR_TRIGGER_STOP = False
+LOG_ERROR_TRIGGER_FORCESTOP = False
+
+LOG_WARNING_THRESHOLD = 0
+LOG_WARNING_TRIGGER_STOP = False
+LOG_WARNING_TRIGGER_FORCESTOP = False
+
+LOG_REDIRECT_THRESHOLD = 0
+LOG_REDIRECT_TRIGGER_STOP = False
+LOG_REDIRECT_TRIGGER_FORCESTOP = False
+
+LOG_RETRY_THRESHOLD = 0
+LOG_RETRY_TRIGGER_STOP = False
+LOG_RETRY_TRIGGER_FORCESTOP = False
+
+LOG_IGNORE_THRESHOLD = 0
+LOG_IGNORE_TRIGGER_STOP = False
+LOG_IGNORE_TRIGGER_FORCESTOP = False
+
+
+############################## System #########################################
+# The default is False, set it to True to enable debug mode and the interactive debugger
+# would be shown in the browser instead of the "500 Internal Server Error" page.
+# Note that use_reloader is set to False in run.py
+DEBUG = False
+
+# The default is False, set it to True to change the logging level from INFO to DEBUG
+# for getting more information about how ScrapydWeb works, especially while debugging.
+VERBOSE = False
+
+# The default is '', which means saving all program data in the Python directory.
+# e.g. 'C:/Users/username/scrapydweb_data' or '/home/username/scrapydweb_data'
+DATA_PATH = os.environ.get('DATA_PATH', '')
+
+# The default is '', which means saving data of Jobs and Timer Tasks in DATA_PATH using SQLite.
+# The data could be also saved in MySQL or PostgreSQL backend in order to improve concurrency.
+# To use MySQL backend, run command: pip install --upgrade pymysql
+# To use PostgreSQL backend, run command: pip install --upgrade psycopg2
+# e.g.
+# 'mysql://username:password@127.0.0.1:3306'
+# 'postgres://username:password@127.0.0.1:5432'
+# 'sqlite:///C:/Users/username'
+# 'sqlite:////home/username'
+DATABASE_URL = os.environ.get('DATABASE_URL', '')

+ 9 - 0
spiders.conf

@@ -0,0 +1,9 @@
+bjx
+cecn
+ceec
+ceeia
+chinapower
+chinapv
+chng
+cnen
+cnnpn

+ 7 - 0
start.sh

@@ -0,0 +1,7 @@
+#!/bin/sh
+nohup scrapyd > scrapyd.log 2>&1 &
+export Back_End_Ip=192.168.1.203
+export Back_End_Port=11031
+export ProjectName=electric
+
+./timertask.py
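
For this last line to work, timertask.py needs to be executable (e.g. chmod +x timertask.py) and the interpreter referenced by its shebang (/usr/local/bin/python) must exist; otherwise it can be started as "python timertask.py" instead.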

+ 57 - 0
timertask.py

@@ -0,0 +1,57 @@
+#!/usr/local/bin/python
+# -*- coding: utf-8 -*-
+# @Last Modified time: 2022-02-24 09:43:13
+#
+# Batch timer task: schedules the crawler spiders on Scrapyd.
+
+import os
+import json
+import time
+import datetime
+import logging
+import requests
+from requests.adapters import HTTPAdapter
+
+from apscheduler.schedulers.blocking import BlockingScheduler
+from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
+
+
+logging.basicConfig(level=logging.INFO,
+	filename='timertask.log',
+	format='%(asctime)s:%(levelname)s:%(message)s'
+)
+
+sched = BlockingScheduler(timezone="Asia/Shanghai")
+spiderlist = ['bjx','cecn','ceec','ceeia','chinapower','chinapv','chng','cnen','cnnpn','cny','cpnn','csg','ctg','cweea','eptc','escn','ewindpower','gxepa','iesplaza','nengyuanjie','newenergy','piec','powerchina','solarbe','solarenpv','sungrow','twea','xhhydropower','zzsolar']
+
+# Fetch the list of spiders to run from the backend service
+def get_spiders():
+    # Backend ip
+    ip = os.environ.get("Back_End_Ip", "192.168.1.203")
+    # Backend port
+    port = os.environ.get("Back_End_Port", 11031)
+    # Ask the backend which spiders should currently be running
+    url = 'http://{}:{}/resource/judge'.format(ip, port)
+    session = requests.Session()
+    session.mount('http://', HTTPAdapter(max_retries=3))
+    try:
+        response = session.get(url, timeout=10)
+        # Return the list of running spiders
+        return json.loads(response.text)['data']['running']
+    except requests.exceptions.RequestException as e:
+        logging.error(e)
+        return []
+
+# Scheduled task: submit each backend-approved spider to the local Scrapyd at 01:00 every day
+@sched.scheduled_job('cron', hour=1)
+def spiders_job():
+    # Get the list of spiders that should run
+    spiders = get_spiders()
+    # Schedule each known spider via Scrapyd's schedule.json endpoint
+    for spider in spiders:
+        if spider in spiderlist:
+            data = {'project': os.environ.get("ProjectName", ""), 'spider': spider, 'jobid': datetime.datetime.now().strftime("%Y-%m-%dT%H_%M_%S")}
+            response = requests.post(url='http://localhost:6800/schedule.json', data=data)
+            logging.info(response.text)
+            time.sleep(2)
+
+sched.start()
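
As a quick sanity check on the scheduling above, Scrapyd's listjobs.json endpoint can be polled to see whether the submitted jobs are pending, running or finished. A minimal sketch, assuming Scrapyd is reachable on localhost:6800 (as in start.sh) and the project is named electric; the helper name check_jobs is illustrative only:

    import requests

    def check_jobs(project='electric', base='http://localhost:6800'):
        # Scrapyd answers with {"status": "ok", "pending": [...], "running": [...], "finished": [...]}
        resp = requests.get('{}/listjobs.json'.format(base), params={'project': project}, timeout=10)
        resp.raise_for_status()
        jobs = resp.json()
        return len(jobs.get('pending', [])), len(jobs.get('running', [])), len(jobs.get('finished', []))

    if __name__ == '__main__':
        pending, running, finished = check_jobs()
        print('pending: {}, running: {}, finished: {}'.format(pending, running, finished))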