base_chunker.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. import hashlib
  2. from embedchain.models.data_type import DataType
  3. class BaseChunker:
  4. def __init__(self, text_splitter):
  5. """Initialize the chunker."""
  6. self.text_splitter = text_splitter
  7. self.data_type = None
  8. def create_chunks(self, loader, src):
  9. """
  10. Loads data and chunks it.
  11. :param loader: The loader which's `load_data` method is used to create
  12. the raw data.
  13. :param src: The data to be handled by the loader. Can be a URL for
  14. remote sources or local content for local loaders.
  15. """
  16. documents = []
  17. ids = []
  18. idMap = {}
  19. datas = loader.load_data(src)
  20. metadatas = []
  21. for data in datas:
  22. content = data["content"]
  23. meta_data = data["meta_data"]
  24. # add data type to meta data to allow query using data type
  25. meta_data["data_type"] = self.data_type.value
  26. url = meta_data["url"]
  27. chunks = self.get_chunks(content)
  28. for chunk in chunks:
  29. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  30. if idMap.get(chunk_id) is None:
  31. idMap[chunk_id] = True
  32. ids.append(chunk_id)
  33. documents.append(chunk)
  34. metadatas.append(meta_data)
  35. return {
  36. "documents": documents,
  37. "ids": ids,
  38. "metadatas": metadatas,
  39. }
  40. def get_chunks(self, content):
  41. """
  42. Returns chunks using text splitter instance.
  43. Override in child class if custom logic.
  44. """
  45. return self.text_splitter.split_text(content)
  46. def set_data_type(self, data_type: DataType):
  47. """
  48. set the data type of chunker
  49. """
  50. self.data_type = data_type
  51. # TODO: This should be done during initialization. This means it has to be done in the child classes.