#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# @Last Modified time: 2022-02-24 09:43:13
#
# Scheduled batch jobs for the crawlers
import os
import json
import time
import datetime
import logging
import requests
from requests.adapters import HTTPAdapter
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR

logging.basicConfig(level=logging.INFO,
                    filename='timertask.log',
                    format='%(asctime)s:%(levelname)s:%(message)s'
                    )

sched = BlockingScheduler(timezone="Asia/Shanghai")

# Spiders that are allowed to be scheduled
spiderlist = ['bjx', 'cecn', 'ceec', 'ceeia', 'chinapower', 'chinapv', 'chng',
              'cnen', 'cnnpn', 'cny', 'cpnn', 'csg', 'ctg', 'cweea', 'eptc',
              'escn', 'ewindpower', 'gxepa', 'iesplaza', 'nengyuanjie',
              'newenergy', 'piec', 'powerchina', 'solarbe', 'solarenpv',
              'sungrow', 'twea', 'xhhydropower', 'zzsolar']
# Fetch the list of spiders to run from the backend
def get_spiders():
    # Backend IP
    ip = os.environ.get("Back_End_Ip", "192.168.1.203")
    # Backend port
    port = os.environ.get("Back_End_Port", 11031)
    # Query the backend service
    url = 'http://{}:{}/resource/judge'.format(ip, port)
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=3))
    try:
        response = session.get(url, timeout=10)
        # Return the list of spiders that should be running
        return json.loads(response.text)['data']['running']
    except requests.exceptions.RequestException as e:
        logging.error(e)
        # Fall back to an empty list so the caller can iterate safely
        return []
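
# For reference, the parsing above implies the backend's /resource/judge
# endpoint answers with JSON shaped like the example below (spider names are
# illustrative, taken from spiderlist; the exact payload is an assumption
# inferred from the code, not documented in the original):
#     {"data": {"running": ["bjx", "solarbe", "csg"]}}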

# Scheduled task: runs daily at 01:00 (Asia/Shanghai)
@sched.scheduled_job('cron', hour=1)
def spiders_job():
    # Get the list of spiders to run
    spiders = get_spiders()
    # Schedule each spider
    for spider in spiders:
        if spider in spiderlist:
            data = {'project': os.environ.get("ProjectName", ""),
                    'spider': spider,
                    'jobid': datetime.datetime.now().strftime("%Y-%m-%dT%H_%M_%S")}
            # Submit the job to the local scrapyd instance (default port 6800)
            response = requests.post(url='http://localhost:6800/schedule.json', data=data)
            logging.info(response.text)
            time.sleep(2)
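
# --- Added sketch (not in the original script) -------------------------------
# EVENT_JOB_EXECUTED and EVENT_JOB_ERROR are imported above but never wired up.
# One plausible use is an event listener that logs each job's outcome; wiring
# it in here, before sched.start(), is only an assumption about the intent.
def job_listener(event):
    if event.exception:
        logging.error('job %s failed: %s', event.job_id, event.exception)
    else:
        logging.info('job %s executed successfully', event.job_id)

sched.add_listener(job_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
# ------------------------------------------------------------------------------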

# Start the blocking scheduler
sched.start()