# base_chunker.py
  1. import hashlib
  2. from embedchain.helper.json_serializable import JSONSerializable
  3. from embedchain.models.data_type import DataType
  4. class BaseChunker(JSONSerializable):
  5. def __init__(self, text_splitter):
  6. """Initialize the chunker."""
  7. self.text_splitter = text_splitter
  8. self.data_type = None
  9. def create_chunks(self, loader, src, app_id=None):
  10. """
  11. Loads data and chunks it.
  12. :param loader: The loader which's `load_data` method is used to create
  13. the raw data.
  14. :param src: The data to be handled by the loader. Can be a URL for
  15. remote sources or local content for local loaders.
  16. :param app_id: App id used to generate the doc_id.
  17. """
  18. documents = []
  19. chunk_ids = []
  20. idMap = {}
  21. data_result = loader.load_data(src)
  22. data_records = data_result["data"]
  23. doc_id = data_result["doc_id"]
  24. # Prefix app_id in the document id if app_id is not None to
  25. # distinguish between different documents stored in the same
  26. # elasticsearch or opensearch index
  27. doc_id = f"{app_id}--{doc_id}" if app_id is not None else doc_id
  28. metadatas = []
  29. for data in data_records:
  30. content = data["content"]
  31. meta_data = data["meta_data"]
  32. # add data type to meta data to allow query using data type
  33. meta_data["data_type"] = self.data_type.value
  34. meta_data["doc_id"] = doc_id
  35. url = meta_data["url"]
  36. chunks = self.get_chunks(content)
  37. for chunk in chunks:
  38. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  39. chunk_id = f"{app_id}--{chunk_id}" if app_id is not None else chunk_id
  40. if idMap.get(chunk_id) is None:
  41. idMap[chunk_id] = True
  42. chunk_ids.append(chunk_id)
  43. documents.append(chunk)
  44. metadatas.append(meta_data)
  45. return {
  46. "documents": documents,
  47. "ids": chunk_ids,
  48. "metadatas": metadatas,
  49. "doc_id": doc_id,
  50. }
  51. def get_chunks(self, content):
  52. """
  53. Returns chunks using text splitter instance.
  54. Override in child class if custom logic.
  55. """
  56. return self.text_splitter.split_text(content)
  57. def set_data_type(self, data_type: DataType):
  58. """
  59. set the data type of chunker
  60. """
  61. self.data_type = data_type
  62. # TODO: This should be done during initialization. This means it has to be done in the child classes.
  63. def get_word_count(self, documents):
  64. return sum([len(document.split(" ")) for document in documents])