data_formatter.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. from embedchain.chunkers.base_chunker import BaseChunker
  2. from embedchain.chunkers.docs_site import DocsSiteChunker
  3. from embedchain.chunkers.docx_file import DocxFileChunker
  4. from embedchain.chunkers.images import ImagesChunker
  5. from embedchain.chunkers.json import JSONChunker
  6. from embedchain.chunkers.mdx import MdxChunker
  7. from embedchain.chunkers.notion import NotionChunker
  8. from embedchain.chunkers.openapi import OpenAPIChunker
  9. from embedchain.chunkers.pdf_file import PdfFileChunker
  10. from embedchain.chunkers.qna_pair import QnaPairChunker
  11. from embedchain.chunkers.sitemap import SitemapChunker
  12. from embedchain.chunkers.table import TableChunker
  13. from embedchain.chunkers.text import TextChunker
  14. from embedchain.chunkers.unstructured_file import UnstructuredFileChunker
  15. from embedchain.chunkers.web_page import WebPageChunker
  16. from embedchain.chunkers.xml import XmlChunker
  17. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  18. from embedchain.config import AddConfig
  19. from embedchain.config.add_config import ChunkerConfig, LoaderConfig
  20. from embedchain.helper.json_serializable import JSONSerializable
  21. from embedchain.loaders.base_loader import BaseLoader
  22. from embedchain.loaders.csv import CsvLoader
  23. from embedchain.loaders.docs_site_loader import DocsSiteLoader
  24. from embedchain.loaders.docx_file import DocxFileLoader
  25. from embedchain.loaders.images import ImagesLoader
  26. from embedchain.loaders.json import JSONLoader
  27. from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
  28. from embedchain.loaders.local_text import LocalTextLoader
  29. from embedchain.loaders.mdx import MdxLoader
  30. from embedchain.loaders.openapi import OpenAPILoader
  31. from embedchain.loaders.pdf_file import PdfFileLoader
  32. from embedchain.loaders.sitemap import SitemapLoader
  33. from embedchain.loaders.unstructured_file import UnstructuredLoader
  34. from embedchain.loaders.web_page import WebPageLoader
  35. from embedchain.loaders.xml import XmlLoader
  36. from embedchain.loaders.youtube_video import YoutubeVideoLoader
  37. from embedchain.models.data_type import DataType
  38. class DataFormatter(JSONSerializable):
  39. """
  40. DataFormatter is an internal utility class which abstracts the mapping for
  41. loaders and chunkers to the data_type entered by the user in their
  42. .add or .add_local method call
  43. """
  44. def __init__(self, data_type: DataType, config: AddConfig):
  45. """
  46. Initialize a dataformatter, set data type and chunker based on datatype.
  47. :param data_type: The type of the data to load and chunk.
  48. :type data_type: DataType
  49. :param config: AddConfig instance with nested loader and chunker config attributes.
  50. :type config: AddConfig
  51. """
  52. self.loader = self._get_loader(data_type=data_type, config=config.loader)
  53. self.chunker = self._get_chunker(data_type=data_type, config=config.chunker)
  54. def _get_loader(self, data_type: DataType, config: LoaderConfig) -> BaseLoader:
  55. """
  56. Returns the appropriate data loader for the given data type.
  57. :param data_type: The type of the data to load.
  58. :type data_type: DataType
  59. :param config: Config to initialize the loader with.
  60. :type config: LoaderConfig
  61. :raises ValueError: If an unsupported data type is provided.
  62. :return: The loader for the given data type.
  63. :rtype: BaseLoader
  64. """
  65. loaders = {
  66. DataType.YOUTUBE_VIDEO: YoutubeVideoLoader,
  67. DataType.PDF_FILE: PdfFileLoader,
  68. DataType.WEB_PAGE: WebPageLoader,
  69. DataType.QNA_PAIR: LocalQnaPairLoader,
  70. DataType.TEXT: LocalTextLoader,
  71. DataType.DOCX: DocxFileLoader,
  72. DataType.SITEMAP: SitemapLoader,
  73. DataType.XML: XmlLoader,
  74. DataType.DOCS_SITE: DocsSiteLoader,
  75. DataType.CSV: CsvLoader,
  76. DataType.MDX: MdxLoader,
  77. DataType.IMAGES: ImagesLoader,
  78. DataType.UNSTRUCTURED: UnstructuredLoader,
  79. DataType.JSON: JSONLoader,
  80. DataType.OPENAPI: OpenAPILoader,
  81. }
  82. lazy_loaders = {DataType.NOTION}
  83. if data_type in loaders:
  84. loader_class: type = loaders[data_type]
  85. loader: BaseLoader = loader_class()
  86. return loader
  87. elif data_type in lazy_loaders:
  88. if data_type == DataType.NOTION:
  89. from embedchain.loaders.notion import NotionLoader
  90. return NotionLoader()
  91. else:
  92. raise ValueError(f"Unsupported data type: {data_type}")
  93. else:
  94. raise ValueError(f"Unsupported data type: {data_type}")
  95. def _get_chunker(self, data_type: DataType, config: ChunkerConfig) -> BaseChunker:
  96. """Returns the appropriate chunker for the given data type.
  97. :param data_type: The type of the data to chunk.
  98. :type data_type: DataType
  99. :param config: Config to initialize the chunker with.
  100. :type config: ChunkerConfig
  101. :raises ValueError: If an unsupported data type is provided.
  102. :return: The chunker for the given data type.
  103. :rtype: BaseChunker
  104. """
  105. chunker_classes = {
  106. DataType.YOUTUBE_VIDEO: YoutubeVideoChunker,
  107. DataType.PDF_FILE: PdfFileChunker,
  108. DataType.WEB_PAGE: WebPageChunker,
  109. DataType.QNA_PAIR: QnaPairChunker,
  110. DataType.TEXT: TextChunker,
  111. DataType.DOCX: DocxFileChunker,
  112. DataType.DOCS_SITE: DocsSiteChunker,
  113. DataType.SITEMAP: SitemapChunker,
  114. DataType.NOTION: NotionChunker,
  115. DataType.CSV: TableChunker,
  116. DataType.MDX: MdxChunker,
  117. DataType.IMAGES: ImagesChunker,
  118. DataType.XML: XmlChunker,
  119. DataType.UNSTRUCTURED: UnstructuredFileChunker,
  120. DataType.JSON: JSONChunker,
  121. DataType.OPENAPI: OpenAPIChunker,
  122. }
  123. if data_type in chunker_classes:
  124. chunker_class: type = chunker_classes[data_type]
  125. chunker: BaseChunker = chunker_class(config)
  126. chunker.set_data_type(data_type)
  127. return chunker
  128. else:
  129. raise ValueError(f"Unsupported data type: {data_type}")