slack.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. import hashlib
  2. import logging
  3. import os
  4. import ssl
  5. from typing import Any, Dict, Optional
  6. import certifi
  7. from embedchain.loaders.base_loader import BaseLoader
  8. from embedchain.utils import clean_string
# Default base URL for the Slack Web API; used whenever the loader config
# does not provide its own "base_url".
SLACK_API_BASE_URL = "https://www.slack.com/api/"
  10. class SlackLoader(BaseLoader):
  11. def __init__(self, config: Optional[Dict[str, Any]] = None):
  12. super().__init__()
  13. if config is not None:
  14. self.config = config
  15. else:
  16. self.config = {"base_url": SLACK_API_BASE_URL}
  17. self.client = None
  18. self._setup_loader(self.config)
  19. def _setup_loader(self, config: Dict[str, Any]):
  20. try:
  21. from slack_sdk import WebClient
  22. except ImportError as e:
  23. raise ImportError(
  24. "Slack loader requires extra dependencies. \
  25. Install with `pip install --upgrade embedchain[slack]`"
  26. ) from e
  27. if os.getenv("SLACK_USER_TOKEN") is None:
  28. raise ValueError(
  29. "SLACK_USER_TOKEN environment variables not provided. Check `https://docs.embedchain.ai/data-sources/slack` to learn more." # noqa:E501
  30. )
  31. logging.info(f"Creating Slack Loader with config: {config}")
  32. # get slack client config params
  33. slack_bot_token = os.getenv("SLACK_USER_TOKEN")
  34. ssl_cert = ssl.create_default_context(cafile=certifi.where())
  35. base_url = config.get("base_url", SLACK_API_BASE_URL)
  36. headers = config.get("headers")
  37. # for Org-Wide App
  38. team_id = config.get("team_id")
  39. self.client = WebClient(
  40. token=slack_bot_token,
  41. base_url=base_url,
  42. ssl=ssl_cert,
  43. headers=headers,
  44. team_id=team_id,
  45. )
  46. logging.info("Slack Loader setup successful!")
  47. def _check_query(self, query):
  48. if not isinstance(query, str):
  49. raise ValueError(
  50. f"Invalid query passed to Slack loader, found: {query}. Check `https://docs.embedchain.ai/data-sources/slack` to learn more." # noqa:E501
  51. )
  52. def load_data(self, query):
  53. self._check_query(query)
  54. try:
  55. data = []
  56. data_content = []
  57. logging.info(f"Searching slack conversations for query: {query}")
  58. results = self.client.search_messages(
  59. query=query,
  60. sort="timestamp",
  61. sort_dir="desc",
  62. count=1000,
  63. )
  64. messages = results.get("messages")
  65. num_message = results.get("total")
  66. logging.info(f"Found {num_message} messages for query: {query}")
  67. matches = messages.get("matches", [])
  68. for message in matches:
  69. url = message.get("permalink")
  70. text = message.get("text")
  71. content = clean_string(text)
  72. message_meta_data_keys = ["channel", "iid", "team", "ts", "type", "user", "username"]
  73. meta_data = message.fromkeys(message_meta_data_keys, "")
  74. meta_data.update({"url": url})
  75. data.append(
  76. {
  77. "content": content,
  78. "meta_data": meta_data,
  79. }
  80. )
  81. data_content.append(content)
  82. doc_id = hashlib.md5((query + ", ".join(data_content)).encode()).hexdigest()
  83. return {
  84. "doc_id": doc_id,
  85. "data": data,
  86. }
  87. except Exception as e:
  88. logging.warning(f"Error in loading slack data: {e}")
  89. raise ValueError(
  90. f"Error in loading slack data: {e}. Check `https://docs.embedchain.ai/data-sources/slack` to learn more." # noqa:E501
  91. ) from e