# rss_feed.py (1.5 KB)
  1. import hashlib
  2. from embedchain.helper.json_serializable import register_deserializable
  3. from embedchain.loaders.base_loader import BaseLoader
  4. @register_deserializable
  5. class RSSFeedLoader(BaseLoader):
  6. """Loader for RSS Feed."""
  7. def load_data(self, url):
  8. """Load data from a rss feed."""
  9. output = self.get_rss_content(url)
  10. doc_id = hashlib.sha256((str(output) + url).encode()).hexdigest()
  11. return {
  12. "doc_id": doc_id,
  13. "data": output,
  14. }
  15. @staticmethod
  16. def serialize_metadata(metadata):
  17. for key, value in metadata.items():
  18. if not isinstance(value, (str, int, float, bool)):
  19. metadata[key] = str(value)
  20. return metadata
  21. @staticmethod
  22. def get_rss_content(url: str):
  23. try:
  24. from langchain.document_loaders import RSSFeedLoader as LangchainRSSFeedLoader
  25. except ImportError:
  26. raise ImportError(
  27. """RSSFeedLoader file requires extra dependencies.
  28. Install with `pip install --upgrade "embedchain[rss_feed]"`"""
  29. ) from None
  30. output = []
  31. loader = LangchainRSSFeedLoader(urls=[url])
  32. data = loader.load()
  33. for entry in data:
  34. meta_data = RSSFeedLoader.serialize_metadata(entry.metadata)
  35. meta_data.update({"url": url})
  36. output.append(
  37. {
  38. "content": entry.page_content,
  39. "meta_data": meta_data,
  40. }
  41. )
  42. return output