data_formatter.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. from embedchain.chunkers.docs_site import DocsSiteChunker
  2. from embedchain.chunkers.docx_file import DocxFileChunker
  3. from embedchain.chunkers.notion import NotionChunker
  4. from embedchain.chunkers.pdf_file import PdfFileChunker
  5. from embedchain.chunkers.qna_pair import QnaPairChunker
  6. from embedchain.chunkers.text import TextChunker
  7. from embedchain.chunkers.web_page import WebPageChunker
  8. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  9. from embedchain.config import AddConfig
  10. from embedchain.loaders.docs_site_loader import DocsSiteLoader
  11. from embedchain.loaders.docx_file import DocxFileLoader
  12. from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
  13. from embedchain.loaders.local_text import LocalTextLoader
  14. from embedchain.loaders.notion import NotionLoader
  15. from embedchain.loaders.pdf_file import PdfFileLoader
  16. from embedchain.loaders.sitemap import SitemapLoader
  17. from embedchain.loaders.web_page import WebPageLoader
  18. from embedchain.loaders.youtube_video import YoutubeVideoLoader
  19. class DataFormatter:
  20. """
  21. DataFormatter is an internal utility class which abstracts the mapping for
  22. loaders and chunkers to the data_type entered by the user in their
  23. .add or .add_local method call
  24. """
  25. def __init__(self, data_type: str, config: AddConfig):
  26. self.loader = self._get_loader(data_type, config.loader)
  27. self.chunker = self._get_chunker(data_type, config.chunker)
  28. def _get_loader(self, data_type, config):
  29. """
  30. Returns the appropriate data loader for the given data type.
  31. :param data_type: The type of the data to load.
  32. :return: The loader for the given data type.
  33. :raises ValueError: If an unsupported data type is provided.
  34. """
  35. loaders = {
  36. "youtube_video": YoutubeVideoLoader(),
  37. "pdf_file": PdfFileLoader(),
  38. "web_page": WebPageLoader(),
  39. "qna_pair": LocalQnaPairLoader(),
  40. "text": LocalTextLoader(),
  41. "docx": DocxFileLoader(),
  42. "sitemap": SitemapLoader(),
  43. "docs_site": DocsSiteLoader(),
  44. "notion": NotionLoader(),
  45. }
  46. if data_type in loaders:
  47. return loaders[data_type]
  48. else:
  49. raise ValueError(f"Unsupported data type: {data_type}")
  50. def _get_chunker(self, data_type, config):
  51. """
  52. Returns the appropriate chunker for the given data type.
  53. :param data_type: The type of the data to chunk.
  54. :return: The chunker for the given data type.
  55. :raises ValueError: If an unsupported data type is provided.
  56. """
  57. chunker_classes = {
  58. "youtube_video": YoutubeVideoChunker,
  59. "pdf_file": PdfFileChunker,
  60. "web_page": WebPageChunker,
  61. "qna_pair": QnaPairChunker,
  62. "text": TextChunker,
  63. "docx": DocxFileChunker,
  64. "sitemap": WebPageChunker,
  65. "docs_site": DocsSiteChunker,
  66. "notion": NotionChunker,
  67. }
  68. if data_type in chunker_classes:
  69. chunker_class = chunker_classes[data_type]
  70. chunker = chunker_class(config)
  71. chunker.set_data_type(data_type)
  72. return chunker
  73. else:
  74. raise ValueError(f"Unsupported data type: {data_type}")