# scrapydweb_settings_v10.py

# coding: utf-8
"""
How ScrapydWeb works:
BROWSER <<<>>> SCRAPYDWEB_BIND:SCRAPYDWEB_PORT <<<>>> your SCRAPYD_SERVERS
GitHub: https://github.com/my8100/scrapydweb
DOCS: https://github.com/my8100/files/blob/master/scrapydweb/README.md
Docs (Chinese): https://github.com/my8100/files/blob/master/scrapydweb/README_CN.md
"""
import os
############################## QUICK SETUP start ##############################
# Setting SCRAPYDWEB_BIND to '0.0.0.0' or IP-OF-THE-CURRENT-HOST would make
# the ScrapydWeb server visible externally; otherwise, set it to '127.0.0.1'.
# The default is '0.0.0.0'.
SCRAPYDWEB_BIND = '0.0.0.0'
# Accept connections on the specified port, the default is 5000.
SCRAPYDWEB_PORT = 5000
# The default is False, set it to True to enable basic auth for the web UI.
ENABLE_AUTH = False
# In order to enable basic auth, both USERNAME and PASSWORD should be non-empty strings.
USERNAME = ''
PASSWORD = ''
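# An illustrative (commented-out) way to enable basic auth without hard-coding credentials;
# the SCRAPYDWEB_USERNAME/SCRAPYDWEB_PASSWORD environment variable names are assumptions
# made for this sketch, not variables that ScrapydWeb reads on its own:
# ENABLE_AUTH = True
# USERNAME = os.environ.get('SCRAPYDWEB_USERNAME', 'admin')
# PASSWORD = os.environ.get('SCRAPYDWEB_PASSWORD', '')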
# Make sure that [Scrapyd](https://github.com/scrapy/scrapyd) has been installed
# and started on all of your hosts.
# Note that for remote access, you have to manually set 'bind_address = 0.0.0.0'
# in the configuration file of Scrapyd and restart Scrapyd to make it visible externally.
# Check out 'https://scrapyd.readthedocs.io/en/latest/config.html#example-configuration-file' for more info.
# - the string format: username:password@ip:port#group
#   - The default port would be 6800 if not provided,
#   - Both basic auth and group are optional.
#   - e.g. '127.0.0.1:6800' or 'username:password@localhost:6801#group'
# - the tuple format: (username, password, ip, port, group)
#   - When the username, password, or group is too complicated (e.g. contains ':@#'),
#     or if ScrapydWeb fails to parse the string format passed in,
#     it's recommended to pass in a tuple of 5 elements.
#   - e.g. ('', '', '127.0.0.1', '6800', '') or ('username', 'password', 'localhost', '6801', 'group')
SCRAPYD_SERVERS = [
    '127.0.0.1:6800',
    # 'username:password@localhost:6801#group',
    # ('username', 'password', 'localhost', '6801', 'group'),
]
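# An illustrative (commented-out) multi-node setup mixing both formats and using the optional
# group suffix; the IP addresses and group names below are placeholders, not real servers:
# SCRAPYD_SERVERS = [
#     '127.0.0.1:6800#local',
#     ('', '', '192.168.0.101', '6800', 'cluster1'),
#     ('', '', '192.168.0.102', '6800', 'cluster1'),
# ]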
# It's recommended to update the three options below
# if both ScrapydWeb and one of your Scrapyd servers run on the same machine.
# If both ScrapydWeb and one of your Scrapyd servers run on the same machine,
# ScrapydWeb would try to directly read Scrapy logfiles from disk, instead of making a request
# to the Scrapyd server.
# e.g. '127.0.0.1:6800' or 'localhost:6801', do not forget the port number.
LOCAL_SCRAPYD_SERVER = '127.0.0.1:6800'
# From the directory in which you run Scrapyd, run the command below
# to find out where the Scrapy logs are stored:
# python -c "from os.path import abspath, isdir; from scrapyd.config import Config; path = abspath(Config().get('logs_dir')); print(path); print(isdir(path))"
# Check out https://scrapyd.readthedocs.io/en/stable/config.html#logs-dir for more info.
# e.g. 'C:/Users/username/logs' or '/home/username/logs'
LOCAL_SCRAPYD_LOGS_DIR = './logs'
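# An illustrative (commented-out) alternative that builds the path at runtime, assuming the
# Scrapy logs live in a 'logs' folder under the current user's home directory:
# LOCAL_SCRAPYD_LOGS_DIR = os.path.join(os.path.expanduser('~'), 'logs')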
# The default is False, set it to True to automatically run LogParser as a subprocess at startup.
# Note that you can run the LogParser service separately via command 'logparser' as you like.
# Run 'logparser -h' to find out the config file of LogParser for more advanced settings.
# Visit https://github.com/my8100/logparser for more info.
ENABLE_LOGPARSER = True
############################## QUICK SETUP end ################################
############################## ScrapydWeb #####################################
# The default is False, set it to True and add both CERTIFICATE_FILEPATH and PRIVATEKEY_FILEPATH
# to run ScrapydWeb in HTTPS mode.
# Note that this feature is not fully tested, please leave your comment here if ScrapydWeb
# raises any exception at startup: https://github.com/my8100/scrapydweb/issues/18
ENABLE_HTTPS = False
# e.g. '/home/username/cert.pem'
CERTIFICATE_FILEPATH = ''
# e.g. '/home/username/cert.key'
PRIVATEKEY_FILEPATH = ''
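# An illustrative (commented-out) HTTPS setup; the file paths are placeholders, and the openssl
# command below is just one common way to create a self-signed certificate for testing:
#   openssl req -x509 -newkey rsa:2048 -nodes -days 365 -keyout cert.key -out cert.pem
# ENABLE_HTTPS = True
# CERTIFICATE_FILEPATH = '/home/username/cert.pem'
# PRIVATEKEY_FILEPATH = '/home/username/cert.key'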
############################## Scrapy #########################################
# ScrapydWeb is able to locate projects in the SCRAPY_PROJECTS_DIR,
# so that you can simply select a project to deploy, instead of packaging it in advance.
# e.g. 'C:/Users/username/myprojects' or '/home/username/myprojects'
SCRAPY_PROJECTS_DIR = './projects'
############################## Scrapyd ########################################
# ScrapydWeb would try every extension in sequence to locate the Scrapy logfile.
# The default is ['.log', '.log.gz', '.txt'].
SCRAPYD_LOG_EXTENSIONS = ['.log', '.log.gz', '.txt']
############################## LogParser ######################################
# Whether to back up the stats json files locally after you visit the Stats page of a job,
# so that they are still accessible even if the original logfile has been deleted.
# The default is True, set it to False to disable this behaviour.
BACKUP_STATS_JSON_FILE = True
############################## Timer Tasks ####################################
# Run ScrapydWeb with argument '-sw' or '--switch_scheduler_state', or click the ENABLED|DISABLED button
# on the Timer Tasks page to turn on/off the scheduler for the timer tasks and the snapshot mechanism below.
# The default is 300, which means ScrapydWeb would automatically create a snapshot of the Jobs page
# and save the jobs info in the database in the background every 300 seconds.
# Note that this behaviour would be paused if the scheduler for timer tasks is disabled.
# Set it to 0 to disable this behaviour.
JOBS_SNAPSHOT_INTERVAL = 300
############################## Run Spider #####################################
# The default is False, set it to True to automatically
# expand the 'settings & arguments' section in the Run Spider page.
SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = False
# The default is 'Mozilla/5.0', set it to a non-empty string to customize the default value of `custom`
# in the drop-down list of `USER_AGENT`.
SCHEDULE_CUSTOM_USER_AGENT = 'Mozilla/5.0'
# The default is None, set it to any value of ['custom', 'Chrome', 'iPhone', 'iPad', 'Android']
# to customize the default value of `USER_AGENT`.
SCHEDULE_USER_AGENT = None
# The default is None, set it to True or False to customize the default value of `ROBOTSTXT_OBEY`.
SCHEDULE_ROBOTSTXT_OBEY = None
# The default is None, set it to True or False to customize the default value of `COOKIES_ENABLED`.
SCHEDULE_COOKIES_ENABLED = None
# The default is None, set it to a non-negative integer to customize the default value of `CONCURRENT_REQUESTS`.
SCHEDULE_CONCURRENT_REQUESTS = None
# The default is None, set it to a non-negative number to customize the default value of `DOWNLOAD_DELAY`.
SCHEDULE_DOWNLOAD_DELAY = 5
# The default is "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1",
# set it to '' or any non-empty string to customize the default value of `additional`.
# Use '\r\n' as the line separator.
SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1"
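# An illustrative (commented-out) alternative default for the `additional` box: each '\r\n'-separated
# '-d' entry becomes one extra parameter on the Run Spider page. CLOSESPIDER_ITEMCOUNT and LOG_LEVEL
# are standard Scrapy settings, but the chosen values here are only examples:
# SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_ITEMCOUNT=100\r\n-d setting=LOG_LEVEL=INFO"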
############################## Page Display ###################################
# The default is True, set it to False to hide the Items page, as well as
# the Items column in the Jobs page.
SHOW_SCRAPYD_ITEMS = True
# The default is True, set it to False to hide the Job column in the Jobs page with non-database view.
SHOW_JOBS_JOB_COLUMN = True
# The default is 0, which means unlimited, set it to a positive integer so that
# only the latest N finished jobs would be shown in the Jobs page with non-database view.
JOBS_FINISHED_JOBS_LIMIT = 0
# If your browser stays on the Jobs page, it would be reloaded automatically every N seconds.
# The default is 300, set it to 0 to disable auto-reloading.
JOBS_RELOAD_INTERVAL = 300
# The load status of the current Scrapyd server is checked every N seconds,
# which is displayed in the top right corner of the page.
# The default is 10, set it to 0 to disable auto-refreshing.
DAEMONSTATUS_REFRESH_INTERVAL = 10
############################## Send Text ######################################
########## usage in scrapy projects ##########
# See the "Send Text" page
########## slack ##########
# How to create a slack app:
# 1. Visit https://api.slack.com/apps and press the "Create New App" button.
# 2. Enter your App Name (e.g. myapp) and select one of your Slack Workspaces, then press "Create App".
# 3. Click the "OAuth & Permissions" menu in the sidebar on the left side of the page.
# 4. Scroll down the page and find "Select Permission Scopes" in the "Scopes" section.
# 5. Enter "send" and select "Send messages as <your-app-name>", then press "Save Changes".
# 6. Scroll up the page and press "Install App to Workspace", then press "Install".
# 7. Copy the "OAuth Access Token", e.g. xoxp-123-456-789-abcde
# See https://api.slack.com/apps for more info
# See steps 1~7 above, e.g. 'xoxp-123-456-789-abcde'
SLACK_TOKEN = os.environ.get('SLACK_TOKEN', '')
# The default channel to use when sending text via slack, e.g. 'general'
SLACK_CHANNEL = 'general'
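# An illustrative (commented-out) way to verify the token outside ScrapydWeb, using Slack's
# chat.postMessage Web API method; the channel name and message text are placeholders:
#   curl -X POST https://slack.com/api/chat.postMessage \
#        -H "Authorization: Bearer $SLACK_TOKEN" \
#        -d "channel=general" -d "text=hello from scrapydweb"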
########## telegram ##########
# How to create a telegram bot:
# 1. Visit https://telegram.me/botfather to start a conversation with Telegram's bot that creates other bots.
# 2. Send the /newbot command to create a new bot in a chat with BotFather.
# 3. Follow the instructions to set up a name and username (e.g. my_bot) for your bot.
# 4. You would get a token (e.g. 123:abcde) after step 3.
# 5. Visit telegram.me/<bot_username> (e.g. telegram.me/my_bot) and say hi to your bot to initiate a conversation.
# 6. Visit https://api.telegram.org/bot<token-in-step-4>/getUpdates to get the chat_id.
# (e.g. Visit https://api.telegram.org/bot123:abcde/getUpdates
# and you can find the chat_id in "chat":{"id":123456789,...)
# See https://core.telegram.org/bots#6-botfather for more info
# See steps 1~4 above, e.g. '123:abcde'
TELEGRAM_TOKEN = os.environ.get('TELEGRAM_TOKEN', '')
# See steps 5~6 above, e.g. 123456789
TELEGRAM_CHAT_ID = int(os.environ.get('TELEGRAM_CHAT_ID', 0))
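# An illustrative (commented-out) way to verify the bot token and chat_id with Telegram's
# sendMessage Bot API method; the token and chat_id below are the placeholders from the steps above:
#   curl "https://api.telegram.org/bot123:abcde/sendMessage?chat_id=123456789&text=hello+from+scrapydweb"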
########## email ##########
# The default subject to use when sending text via email.
EMAIL_SUBJECT = 'Email from #scrapydweb'
########## email sender & recipients ##########
# Leave this option as '' to default to the EMAIL_SENDER option below; otherwise, set it up
# if your email service provider requires a username which is different from the EMAIL_SENDER option below to log in.
# e.g. 'username'
EMAIL_USERNAME = ''
# Depending on your email service provider, you might have to get an app password (like Gmail)
# or an authorization code (like QQ mail) and set it as the EMAIL_PASSWORD.
# Check out the links below to get more help:
# https://stackoverflow.com/a/27515833/10517783 How to send an email with Gmail as the provider using Python?
# https://stackoverflow.com/a/26053352/10517783 Python smtplib proxy support
# e.g. 'password4gmail'
EMAIL_PASSWORD = os.environ.get('EMAIL_PASSWORD', '')
# e.g. 'username@gmail.com'
EMAIL_SENDER = ''
# e.g. ['username@gmail.com', ]
EMAIL_RECIPIENTS = [EMAIL_SENDER]
########## email smtp settings ##########
# Check out this link if you are using ECS of Alibaba Cloud and your SMTP server provides TCP port 25 only:
# https://www.alibabacloud.com/help/doc-detail/56130.htm
# Config for https://mail.google.com using SSL: ('smtp.gmail.com', 465, True)
# Config for https://mail.google.com: ('smtp.gmail.com', 587, False)
# Config for https://mail.qq.com using SSL: ('smtp.qq.com', 465, True)
# Config for http://mail.10086.cn: ('smtp.139.com', 25, False)
SMTP_SERVER = ''
SMTP_PORT = 0
SMTP_OVER_SSL = False
# The timeout in seconds for the connection attempt, the default is 30.
SMTP_CONNECTION_TIMEOUT = 30
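# An illustrative (commented-out) configuration for Gmail over SSL, matching the
# ('smtp.gmail.com', 465, True) tuple listed above:
# SMTP_SERVER = 'smtp.gmail.com'
# SMTP_PORT = 465
# SMTP_OVER_SSL = True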
############################## Monitor & Alert ################################
# The default is False, set it to True to launch the poll subprocess to monitor your crawling jobs.
ENABLE_MONITOR = False
########## poll interval ##########
# Tip: In order to be notified (and stop or forcestop a job when triggered) in time,
# you can reduce the value of POLL_ROUND_INTERVAL and POLL_REQUEST_INTERVAL,
# at the cost of burdening both CPU and bandwidth of your servers.
# Sleep N seconds before starting the next round of polling, the default is 300.
POLL_ROUND_INTERVAL = 300
# Sleep N seconds between each request to the Scrapyd server while polling, the default is 10.
POLL_REQUEST_INTERVAL = 10
########## alert switcher ##########
# Tip: Set the SCRAPYDWEB_BIND option in the "QUICK SETUP" section to the actual IP of your host,
# then you can visit ScrapydWeb via the links attached in the alert.
# The default is False, set it to True to enable alert via Slack, Telegram, or Email.
# You have to set up your accounts in the "Send Text" section above first.
ENABLE_SLACK_ALERT = False
ENABLE_TELEGRAM_ALERT = False
ENABLE_EMAIL_ALERT = False
########## alert working time ##########
# Monday is 1 and Sunday is 7.
# e.g. [1, 2, 3, 4, 5, 6, 7]
ALERT_WORKING_DAYS = []
# From 0 to 23.
# e.g. [9] + list(range(15, 18)) >>> [9, 15, 16, 17], or range(24) for 24 hours
ALERT_WORKING_HOURS = []
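# An illustrative (commented-out) working-time setup that alerts only on weekdays
# between 09:00 and 17:59; the chosen days and hours are just an example:
# ALERT_WORKING_DAYS = list(range(1, 6))    # Monday to Friday
# ALERT_WORKING_HOURS = list(range(9, 18))  # 9, 10, ..., 17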
########## basic triggers ##########
# Trigger alert every N seconds for each running job.
# The default is 0, set it to a positive integer to enable this trigger.
ON_JOB_RUNNING_INTERVAL = 0
# Trigger alert when a job is finished.
# The default is False, set it to True to enable this trigger.
ON_JOB_FINISHED = False
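# An illustrative (commented-out) setup that sends an alert once per hour for each running job
# and another alert when a job finishes; the 3600-second interval is just an example:
# ON_JOB_RUNNING_INTERVAL = 3600
# ON_JOB_FINISHED = True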
########## advanced triggers ##########
# - LOG_XXX_THRESHOLD:
#   - Trigger alert the first time reaching the threshold for a specific kind of log.
#   - The default is 0, set it to a positive integer to enable this trigger.
# - LOG_XXX_TRIGGER_STOP (optional):
#   - The default is False, set it to True to stop current job automatically when reaching the LOG_XXX_THRESHOLD.
#   - The SIGTERM signal would be sent only one time to shut down the crawler gracefully.
#   - In order to avoid an UNCLEAN shutdown, the 'STOP' action would be executed one time at most
#     if none of the 'FORCESTOP' triggers is enabled, no matter how many 'STOP' triggers are enabled.
# - LOG_XXX_TRIGGER_FORCESTOP (optional):
#   - The default is False, set it to True to FORCESTOP current job automatically when reaching the LOG_XXX_THRESHOLD.
#   - The SIGTERM signal would be sent twice resulting in an UNCLEAN shutdown, without the Scrapy stats dumped!
#   - The 'FORCESTOP' action would be executed if both of the 'STOP' and 'FORCESTOP' triggers are enabled.
# Note that the 'STOP' action and the 'FORCESTOP' action would still be executed even when the current time
# is NOT within the ALERT_WORKING_DAYS and the ALERT_WORKING_HOURS, though no alert would be sent.
LOG_CRITICAL_THRESHOLD = 0
LOG_CRITICAL_TRIGGER_STOP = False
LOG_CRITICAL_TRIGGER_FORCESTOP = False
LOG_ERROR_THRESHOLD = 0
LOG_ERROR_TRIGGER_STOP = False
LOG_ERROR_TRIGGER_FORCESTOP = False
LOG_WARNING_THRESHOLD = 0
LOG_WARNING_TRIGGER_STOP = False
LOG_WARNING_TRIGGER_FORCESTOP = False
LOG_REDIRECT_THRESHOLD = 0
LOG_REDIRECT_TRIGGER_STOP = False
LOG_REDIRECT_TRIGGER_FORCESTOP = False
LOG_RETRY_THRESHOLD = 0
LOG_RETRY_TRIGGER_STOP = False
LOG_RETRY_TRIGGER_FORCESTOP = False
LOG_IGNORE_THRESHOLD = 0
LOG_IGNORE_TRIGGER_STOP = False
LOG_IGNORE_TRIGGER_FORCESTOP = False
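# An illustrative (commented-out) trigger setup: alert the first time a running job has logged
# 100 ERROR lines and then stop the job gracefully; the threshold value is just an example:
# LOG_ERROR_THRESHOLD = 100
# LOG_ERROR_TRIGGER_STOP = True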
############################## System #########################################
# The default is False, set it to True to enable debug mode, so that the interactive debugger
# is shown in the browser instead of the "500 Internal Server Error" page.
# Note that use_reloader is set to False in run.py
DEBUG = False
# The default is False, set it to True to change the logging level from INFO to DEBUG
# for getting more information about how ScrapydWeb works, especially while debugging.
VERBOSE = False
# The default is '', which means saving all program data in the Python directory.
# e.g. 'C:/Users/username/scrapydweb_data' or '/home/username/scrapydweb_data'
DATA_PATH = os.environ.get('DATA_PATH', '')
# The default is '', which means saving the data of Jobs and Timer Tasks in DATA_PATH using SQLite.
# The data could also be saved in a MySQL or PostgreSQL backend in order to improve concurrency.
# To use the MySQL backend, run command: pip install --upgrade pymysql
# To use the PostgreSQL backend, run command: pip install --upgrade psycopg2
# e.g.
# 'mysql://username:password@127.0.0.1:3306'
# 'postgres://username:password@127.0.0.1:5432'
# 'sqlite:///C:/Users/username'
# 'sqlite:////home/username'
DATABASE_URL = os.environ.get('DATABASE_URL', '')
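# An illustrative (commented-out) way to switch to the MySQL backend via the environment,
# instead of editing this file; the credentials below are placeholders:
#   export DATABASE_URL='mysql://username:password@127.0.0.1:3306'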