|
@@ -0,0 +1,358 @@
|
|
|
+# coding: utf-8
|
|
|
+"""
|
|
|
+How ScrapydWeb works:
|
|
|
+BROWSER <<<>>> SCRAPYDWEB_BIND:SCRAPYDWEB_PORT <<<>>> your SCRAPYD_SERVERS
|
|
|
+
|
|
|
+GitHub: https://github.com/my8100/scrapydweb
|
|
|
+DOCS: https://github.com/my8100/files/blob/master/scrapydweb/README.md
|
|
|
+DOCS (Chinese): https://github.com/my8100/files/blob/master/scrapydweb/README_CN.md
|
|
|
+"""
|
|
|
+import os
|
|
|
+
|
|
|
+
|
|
|
+############################## QUICK SETUP start ##############################
|
|
|
|
|
|
+# Setting SCRAPYDWEB_BIND to '0.0.0.0' or to the IP of the current host makes
+# the ScrapydWeb server visible externally; otherwise, set it to '127.0.0.1'.
+# The default is '0.0.0.0'.
|
|
|
+SCRAPYDWEB_BIND = '0.0.0.0'
|
|
|
+# Accept connections on the specified port, the default is 5000.
|
|
|
+SCRAPYDWEB_PORT = 5000
|
|
|
+
|
|
|
+# The default is False, set it to True to enable basic auth for the web UI.
|
|
|
+ENABLE_AUTH = False
|
|
|
+# In order to enable basic auth, both USERNAME and PASSWORD should be non-empty strings.
|
|
|
+USERNAME = ''
|
|
|
+PASSWORD = ''
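+# A minimal sketch of enabling basic auth (the username and password below are
+# placeholders for illustration, not values required by ScrapydWeb):
+# ENABLE_AUTH = True
+# USERNAME = 'admin'
+# PASSWORD = 'change-this-password'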
|
|
|
+
|
|
|
+
|
|
|
+# Make sure that [Scrapyd](https://github.com/scrapy/scrapyd) has been installed
|
|
|
+# and started on all of your hosts.
|
|
|
+# Note that for remote access, you have to manually set 'bind_address = 0.0.0.0'
|
|
|
+# in the configuration file of Scrapyd and restart Scrapyd to make it visible externally.
|
|
|
+# Check out 'https://scrapyd.readthedocs.io/en/latest/config.html#example-configuration-file' for more info.
|
|
|
|
|
|
+
|
|
|
+# Support both the string format and the tuple format when adding a Scrapyd server:
+# - the string format: username:password@ip:port#group
+#   - The port defaults to 6800 if not provided.
+#   - Both basic auth and group are optional.
+#   - e.g. '127.0.0.1:6800' or 'username:password@localhost:6801#group'
+# - the tuple format: (username, password, ip, port, group)
+#   - When the username, password, or group is too complicated (e.g. contains ':@#'),
+#     or if ScrapydWeb fails to parse the string format passed in,
+#     it's recommended to pass in a tuple of 5 elements.
+#   - e.g. ('', '', '127.0.0.1', '6800', '') or ('username', 'password', 'localhost', '6801', 'group')
|
|
|
+SCRAPYD_SERVERS = [
|
|
|
+ '127.0.0.1:6800',
|
|
|
+ # 'username:password@localhost:6801#group',
|
|
|
+ # ('username', 'password', 'localhost', '6801', 'group'),
|
|
|
+]
|
|
|
+
|
|
|
+
|
|
|
+# It's recommended to update the three options below
|
|
|
+# if both ScrapydWeb and one of your Scrapyd servers run on the same machine.
|
|
|
|
|
|
+
|
|
|
+# If both ScrapydWeb and one of your Scrapyd servers run on the same machine,
|
|
|
+# ScrapydWeb would try to directly read Scrapy logfiles from disk, instead of making a request
|
|
|
+# to the Scrapyd server.
|
|
|
+# e.g. '127.0.0.1:6800' or 'localhost:6801', do not forget the port number.
|
|
|
+LOCAL_SCRAPYD_SERVER = '127.0.0.1:6800'
|
|
|
+
|
|
|
+# Enter the directory from which you run Scrapyd, then run the command below
+# to find out where the Scrapy logs are stored:
|
|
|
+# python -c "from os.path import abspath, isdir; from scrapyd.config import Config; path = abspath(Config().get('logs_dir')); print(path); print(isdir(path))"
|
|
|
+# Check out https://scrapyd.readthedocs.io/en/stable/config.html#logs-dir for more info.
|
|
|
+# e.g. 'C:/Users/username/logs' or '/home/username/logs'
|
|
|
+LOCAL_SCRAPYD_LOGS_DIR = './logs'
|
|
|
+
|
|
|
+# The default is False, set it to True to automatically run LogParser as a subprocess at startup.
|
|
|
+# Note that you can also run the LogParser service separately via the command 'logparser'.
+# Run 'logparser -h' to locate the config file of LogParser for more advanced settings.
|
|
|
+# Visit https://github.com/my8100/logparser for more info.
|
|
|
+ENABLE_LOGPARSER = True
|
|
|
+############################## QUICK SETUP end ################################
|
|
|
|
|
|
+
|
|
|
+
|
|
|
+############################## ScrapydWeb #####################################
|
|
|
+# The default is False, set it to True and add both CERTIFICATE_FILEPATH and PRIVATEKEY_FILEPATH
|
|
|
+# to run ScrapydWeb in HTTPS mode.
|
|
|
+# Note that this feature is not fully tested, please leave your comment here if ScrapydWeb
|
|
|
+# raises any exception at startup: https://github.com/my8100/scrapydweb/issues/18
|
|
|
+ENABLE_HTTPS = False
|
|
|
+# e.g. '/home/username/cert.pem'
|
|
|
+CERTIFICATE_FILEPATH = ''
|
|
|
+# e.g. '/home/username/cert.key'
|
|
|
+PRIVATEKEY_FILEPATH = ''
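+# A minimal sketch of an HTTPS setup (the file paths below are only placeholders):
+# ENABLE_HTTPS = True
+# CERTIFICATE_FILEPATH = '/home/username/cert.pem'
+# PRIVATEKEY_FILEPATH = '/home/username/cert.key'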
|
|
|
+
|
|
|
+
|
|
|
+############################## Scrapy #########################################
|
|
|
+# ScrapydWeb is able to locate projects in the SCRAPY_PROJECTS_DIR,
|
|
|
+# so that you can simply select a project to deploy, instead of packaging it in advance.
|
|
|
+# e.g. 'C:/Users/username/myprojects' or '/home/username/myprojects'
|
|
|
+SCRAPY_PROJECTS_DIR = './projects'
|
|
|
+
|
|
|
+
|
|
|
+############################## Scrapyd ########################################
|
|
|
+# ScrapydWeb would try every extension in sequence to locate the Scrapy logfile.
|
|
|
+# The default is ['.log', '.log.gz', '.txt'].
|
|
|
+SCRAPYD_LOG_EXTENSIONS = ['.log', '.log.gz', '.txt']
|
|
|
+
|
|
|
+
|
|
|
+############################## LogParser ######################################
|
|
|
+# Whether to back up the stats json files locally after you visit the Stats page of a job,
+# so that the stats are still accessible even if the original logfile has been deleted.
|
|
|
+# The default is True, set it to False to disable this behaviour.
|
|
|
+BACKUP_STATS_JSON_FILE = True
|
|
|
+
|
|
|
+
|
|
|
+############################## Timer Tasks ####################################
|
|
|
+# Run ScrapydWeb with argument '-sw' or '--switch_scheduler_state', or click the ENABLED|DISABLED button
|
|
|
+# on the Timer Tasks page to turn on/off the scheduler for the timer tasks and the snapshot mechanism below.
|
|
|
+
|
|
|
+# The default is 300, which means ScrapydWeb would automatically create a snapshot of the Jobs page
|
|
|
+# and save the jobs info in the database in the background every 300 seconds.
|
|
|
+# Note that this behaviour would be paused if the scheduler for timer tasks is disabled.
|
|
|
+# Set it to 0 to disable this behaviour.
|
|
|
+JOBS_SNAPSHOT_INTERVAL = 300
|
|
|
+
|
|
|
+
|
|
|
+############################## Run Spider #####################################
|
|
|
+# The default is False, set it to True to automatically
|
|
|
+# expand the 'settings & arguments' section in the Run Spider page.
|
|
|
+SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = False
|
|
|
+
|
|
|
+# The default is 'Mozilla/5.0', set it to a non-empty string to customize the default value of `custom`
|
|
|
+# in the drop-down list of `USER_AGENT`.
|
|
|
+SCHEDULE_CUSTOM_USER_AGENT = 'Mozilla/5.0'
|
|
|
+
|
|
|
+# The default is None, set it to any value of ['custom', 'Chrome', 'iPhone', 'iPad', 'Android']
|
|
|
+# to customize the default value of `USER_AGENT`.
|
|
|
+SCHEDULE_USER_AGENT = None
|
|
|
+
|
|
|
+# The default is None, set it to True or False to customize the default value of `ROBOTSTXT_OBEY`.
|
|
|
+SCHEDULE_ROBOTSTXT_OBEY = None
|
|
|
+
|
|
|
+# The default is None, set it to True or False to customize the default value of `COOKIES_ENABLED`.
|
|
|
+SCHEDULE_COOKIES_ENABLED = None
|
|
|
+
|
|
|
+# The default is None, set it to a non-negative integer to customize the default value of `CONCURRENT_REQUESTS`.
|
|
|
+SCHEDULE_CONCURRENT_REQUESTS = None
|
|
|
+
|
|
|
+# The default is None, set it to a non-negative number to customize the default value of `DOWNLOAD_DELAY`.
|
|
|
+SCHEDULE_DOWNLOAD_DELAY = 5
|
|
|
+
|
|
|
+# The default is "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1",
|
|
|
+# set it to '' or any non-empty string to customize the default value of `additional`.
|
|
|
+# Use '\r\n' as the line separator.
|
|
|
+SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1"
|
|
|
+
|
|
|
+
|
|
|
+############################## Page Display ###################################
|
|
|
+# The default is True, set it to False to hide the Items page, as well as
|
|
|
+# the Items column in the Jobs page.
|
|
|
+SHOW_SCRAPYD_ITEMS = True
|
|
|
+
|
|
|
+# The default is True, set it to False to hide the Job column in the Jobs page with non-database view.
|
|
|
+SHOW_JOBS_JOB_COLUMN = True
|
|
|
+
|
|
|
+# The default is 0, which means unlimited; set it to a positive integer so that
+# only the latest N finished jobs would be shown in the Jobs page with non-database view.
|
|
|
+JOBS_FINISHED_JOBS_LIMIT = 0
|
|
|
+
|
|
|
+# If your browser stays on the Jobs page, it would be reloaded automatically every N seconds.
|
|
|
+# The default is 300, set it to 0 to disable auto-reloading.
|
|
|
+JOBS_RELOAD_INTERVAL = 300
|
|
|
+
|
|
|
+# The load status of the current Scrapyd server is checked every N seconds,
|
|
|
+# which is displayed in the top right corner of the page.
|
|
|
+# The default is 10, set it to 0 to disable auto-refreshing.
|
|
|
+DAEMONSTATUS_REFRESH_INTERVAL = 10
|
|
|
+
|
|
|
+
|
|
|
+############################## Send Text ######################################
|
|
|
+########## usage in scrapy projects ##########
|
|
|
+# See the "Send Text" page
|
|
|
+
|
|
|
+########## slack ##########
|
|
|
+# How to create a slack app:
|
|
|
+# 1. Visit https://api.slack.com/apps and press the "Create New App" button.
|
|
|
+# 2. Enter your App Name (e.g. myapp) and select one of your Slack Workspaces, then press "Create App".
|
|
|
+# 3. Click the "OAuth & Permissions" menu in the sidebar on the left side of the page.
|
|
|
+# 4. Scroll down the page and find "Select Permission Scopes" in the "Scopes" section.
+# 5. Enter "send" and select "Send messages as <your-app-name>", then press "Save Changes".
+# 6. Scroll up the page and press "Install App to Workspace", then press "Install".
|
|
|
+# 7. Copy the "OAuth Access Token", e.g. xoxp-123-456-789-abcde
|
|
|
+# See https://api.slack.com/apps for more info
|
|
|
+
|
|
|
+# See step 1~7 above, e.g. 'xoxp-123-456-789-abcde'
|
|
|
+SLACK_TOKEN = os.environ.get('SLACK_TOKEN', '')
|
|
|
+# The default channel to use when sending text via slack, e.g. 'general'
|
|
|
+SLACK_CHANNEL = 'general'
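+# A sketch for checking the token outside of ScrapydWeb (assumes the third-party
+# 'requests' package; run the equivalent in a separate Python shell, not from this file):
+# import requests
+# resp = requests.post('https://slack.com/api/auth.test',
+#                      headers={'Authorization': 'Bearer ' + SLACK_TOKEN})
+# print(resp.json())  # "ok": True means the token is valid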
|
|
|
+
|
|
|
+########## telegram ##########
|
|
|
+# How to create a telegram bot:
|
|
|
+# 1. Visit https://telegram.me/botfather to start a conversation with Telegram's bot that creates other bots.
|
|
|
+# 2. Send the /newbot command to create a new bot in a chat with BotFather.
|
|
|
+# 3. Follow the instructions to set up a name and username (e.g. my_bot) for your bot.
|
|
|
+# 4. You would get a token (e.g. 123:abcde) after step 3.
|
|
|
+# 5. Visit telegram.me/<bot_username> (e.g. telegram.me/my_bot) and say hi to your bot to initiate a conversation.
|
|
|
+# 6. Visit https://api.telegram.org/bot<token-in-step-4>/getUpdates to get the chat_id.
|
|
|
+# (e.g. Visit https://api.telegram.org/bot123:abcde/getUpdates
|
|
|
+# and you can find the chat_id in "chat":{"id":123456789,...)
|
|
|
+# See https://core.telegram.org/bots#6-botfather for more info
|
|
|
+
|
|
|
+# See step 1~4 above, e.g. '123:abcde'
|
|
|
+TELEGRAM_TOKEN = os.environ.get('TELEGRAM_TOKEN', '')
|
|
|
+# See step 5~6 above, e.g. 123456789
|
|
|
+TELEGRAM_CHAT_ID = int(os.environ.get('TELEGRAM_CHAT_ID', 0))
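+# A sketch for finding the chat_id described in step 6 above (assumes the third-party
+# 'requests' package; run the equivalent in a separate Python shell, not from this file):
+# import requests
+# updates = requests.get('https://api.telegram.org/bot%s/getUpdates' % TELEGRAM_TOKEN).json()
+# for update in updates.get('result', []):
+#     print(update.get('message', {}).get('chat', {}))  # look for the 'id' value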
|
|
|
+
|
|
|
+########## email ##########
|
|
|
+# The default subject to use when sending text via email.
|
|
|
+EMAIL_SUBJECT = 'Email from #scrapydweb'
|
|
|
+
|
|
|
+########## email sender & recipients ##########
|
|
|
+# Leave this option as '' to default to the EMAIL_SENDER option below; otherwise, set it
+# if your email service provider requires a username which is different from the EMAIL_SENDER option below to log in.
|
|
|
+# e.g. 'username'
|
|
|
+EMAIL_USERNAME = ''
|
|
|
+# Depending on your email service provider, you might have to get an app password (like Gmail)
+# or an authorization code (like QQ mail) and set it as the EMAIL_PASSWORD.
|
|
|
+# Check out links below to get more help:
|
|
|
+# https://stackoverflow.com/a/27515833/10517783 How to send an email with Gmail as the provider using Python?
|
|
|
+# https://stackoverflow.com/a/26053352/10517783 Python smtplib proxy support
|
|
|
+# e.g. 'password4gmail'
|
|
|
+EMAIL_PASSWORD = os.environ.get('EMAIL_PASSWORD', '')
|
|
|
+
|
|
|
+# e.g. 'username@gmail.com'
|
|
|
+EMAIL_SENDER = ''
|
|
|
+# e.g. ['username@gmail.com', ]
|
|
|
+EMAIL_RECIPIENTS = [EMAIL_SENDER]
|
|
|
+
|
|
|
+########## email smtp settings ##########
|
|
|
+# Check out this link if you are using ECS of Alibaba Cloud and your SMTP server provides TCP port 25 only:
|
|
|
+# https://www.alibabacloud.com/help/doc-detail/56130.htm
|
|
|
+# Config for https://mail.google.com using SSL: ('smtp.gmail.com', 465, True)
|
|
|
+# Config for https://mail.google.com: ('smtp.gmail.com', 587, False)
|
|
|
+# Config for https://mail.qq.com using SSL: ('smtp.qq.com', 465, True)
|
|
|
+# Config for http://mail.10086.cn: ('smtp.139.com', 25, False)
|
|
|
+SMTP_SERVER = ''
|
|
|
+SMTP_PORT = 0
|
|
|
+SMTP_OVER_SSL = False
|
|
|
+# The timeout in seconds for the connection attempt, the default is 30.
|
|
|
+SMTP_CONNECTION_TIMEOUT = 30
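+# A minimal sketch (standard library only) for verifying the SMTP settings above;
+# run the equivalent in a separate Python shell, not from this file:
+# import smtplib
+# smtp_cls = smtplib.SMTP_SSL if SMTP_OVER_SSL else smtplib.SMTP
+# server = smtp_cls(SMTP_SERVER, SMTP_PORT, timeout=SMTP_CONNECTION_TIMEOUT)
+# # server.starttls()  # usually required for port 587 (STARTTLS without SSL)
+# server.login(EMAIL_USERNAME or EMAIL_SENDER, EMAIL_PASSWORD)
+# server.quit()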
|
|
|
+
|
|
|
+
|
|
|
+############################## Monitor & Alert ################################
|
|
|
+# The default is False, set it to True to launch the poll subprocess to monitor your crawling jobs.
|
|
|
+ENABLE_MONITOR = False
|
|
|
+
|
|
|
+########## poll interval ##########
|
|
|
+# Tip: In order to be notified (and stop or forcestop a job when triggered) in time,
+# you can reduce the values of POLL_ROUND_INTERVAL and POLL_REQUEST_INTERVAL,
+# at the cost of a heavier load on both the CPU and the bandwidth of your servers.
|
|
|
+
|
|
|
+# Sleep N seconds before starting the next round of polling, the default is 300.
|
|
|
+POLL_ROUND_INTERVAL = 300
|
|
|
+# Sleep N seconds between each request to the Scrapyd server while polling, the default is 10.
|
|
|
+POLL_REQUEST_INTERVAL = 10
|
|
|
+
|
|
|
+########## alert switcher ##########
|
|
|
+# Tip: Set the SCRAPYDWEB_BIND option in the "QUICK SETUP" section to the actual IP of your host,
|
|
|
+# then you can visit ScrapydWeb via the links attached in the alert.
|
|
|
+
|
|
|
+# The default is False, set it to True to enable alert via Slack, Telegram, or Email.
|
|
|
+# You have to set up your accounts in the "Send text" section above first.
|
|
|
+ENABLE_SLACK_ALERT = False
|
|
|
+ENABLE_TELEGRAM_ALERT = False
|
|
|
+ENABLE_EMAIL_ALERT = False
|
|
|
+
|
|
|
+########## alert working time ##########
|
|
|
+# Monday is 1 and Sunday is 7.
|
|
|
+# e.g. [1, 2, 3, 4, 5, 6, 7]
|
|
|
+ALERT_WORKING_DAYS = []
|
|
|
+
|
|
|
+# From 0 to 23.
|
|
|
+# e.g. [9] + list(range(15, 18)) >>> [9, 15, 16, 17], or range(24) for 24 hours
|
|
|
+ALERT_WORKING_HOURS = []
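+# e.g. a sketch that keeps alerting enabled around the clock, seven days a week:
+# ALERT_WORKING_DAYS = list(range(1, 8))   # Monday (1) to Sunday (7)
+# ALERT_WORKING_HOURS = list(range(24))    # 0 to 23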
|
|
|
+
|
|
|
+########## basic triggers ##########
|
|
|
+# Trigger alert every N seconds for each running job.
|
|
|
+# The default is 0, set it to a positive integer to enable this trigger.
|
|
|
+ON_JOB_RUNNING_INTERVAL = 0
|
|
|
+
|
|
|
+# Trigger alert when a job is finished.
|
|
|
+# The default is False, set it to True to enable this trigger.
|
|
|
+ON_JOB_FINISHED = False
|
|
|
+
|
|
|
+########## advanced triggers ##########
|
|
|
+# - LOG_XXX_THRESHOLD:
|
|
|
+# - Trigger alert the first time reaching the threshold for a specific kind of log.
|
|
|
+# - The default is 0, set it to a positive integer to enable this trigger.
|
|
|
+# - LOG_XXX_TRIGGER_STOP (optional):
|
|
|
+# - The default is False, set it to True to stop current job automatically when reaching the LOG_XXX_THRESHOLD.
|
|
|
+# - The SIGTERM signal would be sent only one time to shut down the crawler gracefully.
|
|
|
+# - In order to avoid an UNCLEAN shutdown, the 'STOP' action would be executed one time at most
|
|
|
+# - if none of the 'FORCESTOP' triggers is enabled, no matter how many 'STOP' triggers are enabled.
|
|
|
+# - LOG_XXX_TRIGGER_FORCESTOP (optional):
|
|
|
+# - The default is False, set it to True to FORCESTOP current job automatically when reaching the LOG_XXX_THRESHOLD.
|
|
|
+# - The SIGTERM signal would be sent twice resulting in an UNCLEAN shutdown, without the Scrapy stats dumped!
|
|
|
+# - The 'FORCESTOP' action would be executed if both of the 'STOP' and 'FORCESTOP' triggers are enabled.
|
|
|
+
|
|
|
+# Note that the 'STOP' action and the 'FORCESTOP' action would still be executed even when the current time
|
|
|
+# is NOT within the ALERT_WORKING_DAYS and the ALERT_WORKING_HOURS, though no alert would be sent.
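+# e.g. a sketch that sends an alert on reaching 10 ERROR logs in a job
+# and also stops that job automatically:
+# LOG_ERROR_THRESHOLD = 10
+# LOG_ERROR_TRIGGER_STOP = True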
|
|
|
+
|
|
|
+LOG_CRITICAL_THRESHOLD = 0
|
|
|
+LOG_CRITICAL_TRIGGER_STOP = False
|
|
|
+LOG_CRITICAL_TRIGGER_FORCESTOP = False
|
|
|
+
|
|
|
+LOG_ERROR_THRESHOLD = 0
|
|
|
+LOG_ERROR_TRIGGER_STOP = False
|
|
|
+LOG_ERROR_TRIGGER_FORCESTOP = False
|
|
|
+
|
|
|
+LOG_WARNING_THRESHOLD = 0
|
|
|
+LOG_WARNING_TRIGGER_STOP = False
|
|
|
+LOG_WARNING_TRIGGER_FORCESTOP = False
|
|
|
+
|
|
|
+LOG_REDIRECT_THRESHOLD = 0
|
|
|
+LOG_REDIRECT_TRIGGER_STOP = False
|
|
|
+LOG_REDIRECT_TRIGGER_FORCESTOP = False
|
|
|
+
|
|
|
+LOG_RETRY_THRESHOLD = 0
|
|
|
+LOG_RETRY_TRIGGER_STOP = False
|
|
|
+LOG_RETRY_TRIGGER_FORCESTOP = False
|
|
|
+
|
|
|
+LOG_IGNORE_THRESHOLD = 0
|
|
|
+LOG_IGNORE_TRIGGER_STOP = False
|
|
|
+LOG_IGNORE_TRIGGER_FORCESTOP = False
|
|
|
+
|
|
|
+
|
|
|
+############################## System #########################################
|
|
|
+# The default is False, set it to True to enable debug mode, so that the interactive debugger
+# is shown in the browser instead of the "500 Internal Server Error" page.
|
|
|
+# Note that use_reloader is set to False in run.py
|
|
|
+DEBUG = False
|
|
|
+
|
|
|
+# The default is False, set it to True to change the logging level from INFO to DEBUG
|
|
|
+# for getting more information about how ScrapydWeb works, especially while debugging.
|
|
|
+VERBOSE = False
|
|
|
+
|
|
|
+# The default is '', which means saving all program data in the Python directory.
|
|
|
+# e.g. 'C:/Users/username/scrapydweb_data' or '/home/username/scrapydweb_data'
|
|
|
+DATA_PATH = os.environ.get('DATA_PATH', '')
|
|
|
+
|
|
|
+# The default is '', which means saving data of Jobs and Timer Tasks in DATA_PATH using SQLite.
|
|
|
+# The data can also be saved in a MySQL or PostgreSQL backend in order to improve concurrency.
|
|
|
+# To use MySQL backend, run command: pip install --upgrade pymysql
|
|
|
+# To use PostgreSQL backend, run command: pip install --upgrade psycopg2
|
|
|
+# e.g.
|
|
|
+# 'mysql://username:password@127.0.0.1:3306'
|
|
|
+# 'postgres://username:password@127.0.0.1:5432'
|
|
|
+# 'sqlite:///C:/Users/username'
|
|
|
+# 'sqlite:////home/username'
|
|
|
+DATABASE_URL = os.environ.get('DATABASE_URL', '')
|