data_formatter.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. from embedchain.loaders.youtube_video import YoutubeVideoLoader
  2. from embedchain.loaders.pdf_file import PdfFileLoader
  3. from embedchain.loaders.web_page import WebPageLoader
  4. from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
  5. from embedchain.loaders.local_text import LocalTextLoader
  6. from embedchain.loaders.docx_file import DocxFileLoader
  7. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  8. from embedchain.chunkers.pdf_file import PdfFileChunker
  9. from embedchain.chunkers.web_page import WebPageChunker
  10. from embedchain.chunkers.qna_pair import QnaPairChunker
  11. from embedchain.chunkers.text import TextChunker
  12. from embedchain.chunkers.docx_file import DocxFileChunker
  13. class DataFormatter:
  14. """
  15. DataFormatter is an internal utility class which abstracts the mapping for
  16. loaders and chunkers to the data_type entered by the user in their
  17. .add or .add_local method call
  18. """
  19. def __init__(self, data_type):
  20. self.loader = self._get_loader(data_type)
  21. self.chunker = self._get_chunker(data_type)
  22. def _get_loader(self, data_type):
  23. """
  24. Returns the appropriate data loader for the given data type.
  25. :param data_type: The type of the data to load.
  26. :return: The loader for the given data type.
  27. :raises ValueError: If an unsupported data type is provided.
  28. """
  29. loaders = {
  30. 'youtube_video': YoutubeVideoLoader(),
  31. 'pdf_file': PdfFileLoader(),
  32. 'web_page': WebPageLoader(),
  33. 'qna_pair': LocalQnaPairLoader(),
  34. 'text': LocalTextLoader(),
  35. 'docx': DocxFileLoader(),
  36. }
  37. if data_type in loaders:
  38. return loaders[data_type]
  39. else:
  40. raise ValueError(f"Unsupported data type: {data_type}")
  41. def _get_chunker(self, data_type):
  42. """
  43. Returns the appropriate chunker for the given data type.
  44. :param data_type: The type of the data to chunk.
  45. :return: The chunker for the given data type.
  46. :raises ValueError: If an unsupported data type is provided.
  47. """
  48. chunkers = {
  49. 'youtube_video': YoutubeVideoChunker(),
  50. 'pdf_file': PdfFileChunker(),
  51. 'web_page': WebPageChunker(),
  52. 'qna_pair': QnaPairChunker(),
  53. 'text': TextChunker(),
  54. 'docx': DocxFileChunker(),
  55. }
  56. if data_type in chunkers:
  57. return chunkers[data_type]
  58. else:
  59. raise ValueError(f"Unsupported data type: {data_type}")