data_formatter.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142
  1. from embedchain.chunkers.base_chunker import BaseChunker
  2. from embedchain.chunkers.docs_site import DocsSiteChunker
  3. from embedchain.chunkers.docx_file import DocxFileChunker
  4. from embedchain.chunkers.gmail import GmailChunker
  5. from embedchain.chunkers.images import ImagesChunker
  6. from embedchain.chunkers.json import JSONChunker
  7. from embedchain.chunkers.mdx import MdxChunker
  8. from embedchain.chunkers.notion import NotionChunker
  9. from embedchain.chunkers.openapi import OpenAPIChunker
  10. from embedchain.chunkers.pdf_file import PdfFileChunker
  11. from embedchain.chunkers.qna_pair import QnaPairChunker
  12. from embedchain.chunkers.sitemap import SitemapChunker
  13. from embedchain.chunkers.table import TableChunker
  14. from embedchain.chunkers.text import TextChunker
  15. from embedchain.chunkers.unstructured_file import UnstructuredFileChunker
  16. from embedchain.chunkers.web_page import WebPageChunker
  17. from embedchain.chunkers.xml import XmlChunker
  18. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  19. from embedchain.config import AddConfig
  20. from embedchain.config.add_config import ChunkerConfig, LoaderConfig
  21. from embedchain.helper.json_serializable import JSONSerializable
  22. from embedchain.loaders.base_loader import BaseLoader
  23. from embedchain.loaders.csv import CsvLoader
  24. from embedchain.loaders.docs_site_loader import DocsSiteLoader
  25. from embedchain.loaders.docx_file import DocxFileLoader
  26. from embedchain.loaders.gmail import GmailLoader
  27. from embedchain.loaders.images import ImagesLoader
  28. from embedchain.loaders.json import JSONLoader
  29. from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
  30. from embedchain.loaders.local_text import LocalTextLoader
  31. from embedchain.loaders.mdx import MdxLoader
  32. from embedchain.loaders.openapi import OpenAPILoader
  33. from embedchain.loaders.pdf_file import PdfFileLoader
  34. from embedchain.loaders.sitemap import SitemapLoader
  35. from embedchain.loaders.unstructured_file import UnstructuredLoader
  36. from embedchain.loaders.web_page import WebPageLoader
  37. from embedchain.loaders.xml import XmlLoader
  38. from embedchain.loaders.youtube_video import YoutubeVideoLoader
  39. from embedchain.models.data_type import DataType
  40. class DataFormatter(JSONSerializable):
  41. """
  42. DataFormatter is an internal utility class which abstracts the mapping for
  43. loaders and chunkers to the data_type entered by the user in their
  44. .add or .add_local method call
  45. """
  46. def __init__(self, data_type: DataType, config: AddConfig):
  47. """
  48. Initialize a dataformatter, set data type and chunker based on datatype.
  49. :param data_type: The type of the data to load and chunk.
  50. :type data_type: DataType
  51. :param config: AddConfig instance with nested loader and chunker config attributes.
  52. :type config: AddConfig
  53. """
  54. self.loader = self._get_loader(data_type=data_type, config=config.loader)
  55. self.chunker = self._get_chunker(data_type=data_type, config=config.chunker)
  56. def _get_loader(self, data_type: DataType, config: LoaderConfig) -> BaseLoader:
  57. """
  58. Returns the appropriate data loader for the given data type.
  59. :param data_type: The type of the data to load.
  60. :type data_type: DataType
  61. :param config: Config to initialize the loader with.
  62. :type config: LoaderConfig
  63. :raises ValueError: If an unsupported data type is provided.
  64. :return: The loader for the given data type.
  65. :rtype: BaseLoader
  66. """
  67. loaders = {
  68. DataType.YOUTUBE_VIDEO: YoutubeVideoLoader,
  69. DataType.PDF_FILE: PdfFileLoader,
  70. DataType.WEB_PAGE: WebPageLoader,
  71. DataType.QNA_PAIR: LocalQnaPairLoader,
  72. DataType.TEXT: LocalTextLoader,
  73. DataType.DOCX: DocxFileLoader,
  74. DataType.SITEMAP: SitemapLoader,
  75. DataType.XML: XmlLoader,
  76. DataType.DOCS_SITE: DocsSiteLoader,
  77. DataType.CSV: CsvLoader,
  78. DataType.MDX: MdxLoader,
  79. DataType.IMAGES: ImagesLoader,
  80. DataType.UNSTRUCTURED: UnstructuredLoader,
  81. DataType.JSON: JSONLoader,
  82. DataType.OPENAPI: OpenAPILoader,
  83. DataType.GMAIL: GmailLoader,
  84. }
  85. lazy_loaders = {DataType.NOTION}
  86. if data_type in loaders:
  87. loader_class: type = loaders[data_type]
  88. loader: BaseLoader = loader_class()
  89. return loader
  90. elif data_type in lazy_loaders:
  91. if data_type == DataType.NOTION:
  92. from embedchain.loaders.notion import NotionLoader
  93. return NotionLoader()
  94. else:
  95. raise ValueError(f"Unsupported data type: {data_type}")
  96. else:
  97. raise ValueError(f"Unsupported data type: {data_type}")
  98. def _get_chunker(self, data_type: DataType, config: ChunkerConfig) -> BaseChunker:
  99. """Returns the appropriate chunker for the given data type.
  100. :param data_type: The type of the data to chunk.
  101. :type data_type: DataType
  102. :param config: Config to initialize the chunker with.
  103. :type config: ChunkerConfig
  104. :raises ValueError: If an unsupported data type is provided.
  105. :return: The chunker for the given data type.
  106. :rtype: BaseChunker
  107. """
  108. chunker_classes = {
  109. DataType.YOUTUBE_VIDEO: YoutubeVideoChunker,
  110. DataType.PDF_FILE: PdfFileChunker,
  111. DataType.WEB_PAGE: WebPageChunker,
  112. DataType.QNA_PAIR: QnaPairChunker,
  113. DataType.TEXT: TextChunker,
  114. DataType.DOCX: DocxFileChunker,
  115. DataType.DOCS_SITE: DocsSiteChunker,
  116. DataType.SITEMAP: SitemapChunker,
  117. DataType.NOTION: NotionChunker,
  118. DataType.CSV: TableChunker,
  119. DataType.MDX: MdxChunker,
  120. DataType.IMAGES: ImagesChunker,
  121. DataType.XML: XmlChunker,
  122. DataType.UNSTRUCTURED: UnstructuredFileChunker,
  123. DataType.JSON: JSONChunker,
  124. DataType.OPENAPI: OpenAPIChunker,
  125. DataType.GMAIL: GmailChunker,
  126. }
  127. if data_type in chunker_classes:
  128. chunker_class: type = chunker_classes[data_type]
  129. chunker: BaseChunker = chunker_class(config)
  130. chunker.set_data_type(data_type)
  131. return chunker
  132. else:
  133. raise ValueError(f"Unsupported data type: {data_type}")