data_formatter.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. from embedchain.chunkers.docx_file import DocxFileChunker
  2. from embedchain.chunkers.pdf_file import PdfFileChunker
  3. from embedchain.chunkers.qna_pair import QnaPairChunker
  4. from embedchain.chunkers.text import TextChunker
  5. from embedchain.chunkers.web_page import WebPageChunker
  6. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  7. from embedchain.config import AddConfig
  8. from embedchain.loaders.docx_file import DocxFileLoader
  9. from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
  10. from embedchain.loaders.local_text import LocalTextLoader
  11. from embedchain.loaders.pdf_file import PdfFileLoader
  12. from embedchain.loaders.sitemap import SitemapLoader
  13. from embedchain.loaders.web_page import WebPageLoader
  14. from embedchain.loaders.youtube_video import YoutubeVideoLoader
  class DataFormatter:
      """
      DataFormatter is an internal utility class which abstracts the mapping for
      loaders and chunkers to the data_type entered by the user in their
      .add or .add_local method call.

      On construction it resolves ``data_type`` to one loader instance and one
      chunker instance, stored as ``self.loader`` and ``self.chunker``.
      """

      def __init__(self, data_type: str, config: AddConfig):
          """
          Resolve the loader and chunker for ``data_type``.

          :param data_type: Key identifying the kind of data, e.g. "pdf_file".
          :param config: Configuration whose ``loader`` and ``chunker``
              attributes are forwarded to the respective lookups.
          :raises ValueError: If ``data_type`` is not a supported data type.
          """
          self.loader = self._get_loader(data_type, config.loader)
          self.chunker = self._get_chunker(data_type, config.chunker)

      def _get_loader(self, data_type, config):
          """
          Returns the appropriate data loader for the given data type.

          :param data_type: The type of the data to load.
          :param config: Loader configuration. Accepted for interface
              compatibility; the loaders are constructed without arguments,
              so it is currently unused.
          :return: The loader for the given data type.
          :raises ValueError: If an unsupported data type is provided.
          """
          # Map to classes, not instances: only the requested loader is ever
          # constructed, instead of building all of them and discarding six.
          loader_classes = {
              "youtube_video": YoutubeVideoLoader,
              "pdf_file": PdfFileLoader,
              "web_page": WebPageLoader,
              "qna_pair": LocalQnaPairLoader,
              "text": LocalTextLoader,
              "docx": DocxFileLoader,
              "sitemap": SitemapLoader,
          }
          loader_class = loader_classes.get(data_type)
          if loader_class is None:
              raise ValueError(f"Unsupported data type: {data_type}")
          return loader_class()

      def _get_chunker(self, data_type, config):
          """
          Returns the appropriate chunker for the given data type.

          :param data_type: The type of the data to chunk.
          :param config: Chunker configuration, passed to the constructor of
              the selected chunker.
          :return: The chunker for the given data type.
          :raises ValueError: If an unsupported data type is provided.
          """
          # Map to classes, not instances: only the requested chunker is ever
          # constructed with ``config``.
          chunker_classes = {
              "youtube_video": YoutubeVideoChunker,
              "pdf_file": PdfFileChunker,
              "web_page": WebPageChunker,
              "qna_pair": QnaPairChunker,
              "text": TextChunker,
              "docx": DocxFileChunker,
              # Sitemap data reuses the web page chunker.
              "sitemap": WebPageChunker,
          }
          chunker_class = chunker_classes.get(data_type)
          if chunker_class is None:
              raise ValueError(f"Unsupported data type: {data_type}")
          return chunker_class(config)