base_chunker.py

import hashlib

from embedchain.helper_classes.json_serializable import JSONSerializable
from embedchain.models.data_type import DataType


class BaseChunker(JSONSerializable):
    def __init__(self, text_splitter):
        """Initialize the chunker."""
        self.text_splitter = text_splitter
        self.data_type = None

    def create_chunks(self, loader, src):
        """
        Loads data and chunks it.

        :param loader: The loader whose `load_data` method is used to create
        the raw data.
        :param src: The data to be handled by the loader. Can be a URL for
        remote sources or local content for local loaders.
        """
        documents = []
        ids = []
        idMap = {}
        datas = loader.load_data(src)
        metadatas = []
        for data in datas:
            content = data["content"]
            meta_data = data["meta_data"]
            # add data type to meta data to allow query using data type
            meta_data["data_type"] = self.data_type.value
            url = meta_data["url"]
            chunks = self.get_chunks(content)
            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                # de-duplicate: only keep the first occurrence of a
                # (chunk content + url) hash
                if idMap.get(chunk_id) is None:
                    idMap[chunk_id] = True
                    ids.append(chunk_id)
                    documents.append(chunk)
                    metadatas.append(meta_data)
        return {
            "documents": documents,
            "ids": ids,
            "metadatas": metadatas,
        }

    def get_chunks(self, content):
        """
        Returns chunks using the text splitter instance.

        Override in a child class for custom logic.
        """
        return self.text_splitter.split_text(content)

    def set_data_type(self, data_type: DataType):
        """
        Set the data type of the chunker.
        """
        self.data_type = data_type

        # TODO: This should be done during initialization. This means it has
        # to be done in the child classes.
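# --- Usage sketch (not part of the library source) ---------------------------
# A minimal example of how this chunker could be driven. It assumes
# langchain's RecursiveCharacterTextSplitter as the text splitter, a
# hypothetical DummyLoader that returns the structure create_chunks expects
# (a list of dicts with "content" and "meta_data" containing a "url"), and
# that the DataType enum defines a TEXT member.

# from langchain.text_splitter import RecursiveCharacterTextSplitter
#
#
# class DummyLoader:
#     """Hypothetical loader used only for illustration."""
#
#     def load_data(self, src):
#         return [
#             {
#                 "content": "Some long document text that will be split into chunks.",
#                 "meta_data": {"url": src},
#             }
#         ]
#
#
# if __name__ == "__main__":
#     splitter = RecursiveCharacterTextSplitter(chunk_size=50, chunk_overlap=0)
#     chunker = BaseChunker(splitter)
#     chunker.set_data_type(DataType.TEXT)  # assumption: DataType has TEXT
#
#     result = chunker.create_chunks(DummyLoader(), "https://example.com/doc")
#     print(result["ids"])        # sha256 ids, deduplicated per (chunk, url)
#     print(result["documents"])  # the chunk texts
#     print(result["metadatas"])  # per-chunk metadata, including data_type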