rss_feed.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253
  1. import hashlib
  2. from embedchain.helpers.json_serializable import register_deserializable
  3. from embedchain.loaders.base_loader import BaseLoader
  4. @register_deserializable
  5. class RSSFeedLoader(BaseLoader):
  6. """Loader for RSS Feed."""
  7. def load_data(self, url):
  8. """Load data from a rss feed."""
  9. output = self.get_rss_content(url)
  10. doc_id = hashlib.sha256((str(output) + url).encode()).hexdigest()
  11. return {
  12. "doc_id": doc_id,
  13. "data": output,
  14. }
  15. @staticmethod
  16. def serialize_metadata(metadata):
  17. for key, value in metadata.items():
  18. if not isinstance(value, (str, int, float, bool)):
  19. metadata[key] = str(value)
  20. return metadata
  21. @staticmethod
  22. def get_rss_content(url: str):
  23. try:
  24. from langchain_community.document_loaders import \
  25. RSSFeedLoader as LangchainRSSFeedLoader
  26. except ImportError:
  27. raise ImportError(
  28. """RSSFeedLoader file requires extra dependencies.
  29. Install with `pip install --upgrade "embedchain[rss_feed]"`"""
  30. ) from None
  31. output = []
  32. loader = LangchainRSSFeedLoader(urls=[url])
  33. data = loader.load()
  34. for entry in data:
  35. meta_data = RSSFeedLoader.serialize_metadata(entry.metadata)
  36. meta_data.update({"url": url})
  37. output.append(
  38. {
  39. "content": entry.page_content,
  40. "meta_data": meta_data,
  41. }
  42. )
  43. return output