base_chunker.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. import hashlib
  2. class BaseChunker:
  3. def __init__(self, text_splitter):
  4. """Initialize the chunker."""
  5. self.text_splitter = text_splitter
  6. self.data_type = None
  7. def create_chunks(self, loader, src):
  8. """
  9. Loads data and chunks it.
  10. :param loader: The loader which's `load_data` method is used to create
  11. the raw data.
  12. :param src: The data to be handled by the loader. Can be a URL for
  13. remote sources or local content for local loaders.
  14. """
  15. documents = []
  16. ids = []
  17. idMap = {}
  18. datas = loader.load_data(src)
  19. metadatas = []
  20. for data in datas:
  21. content = data["content"]
  22. meta_data = data["meta_data"]
  23. # add data type to meta data to allow query using data type
  24. meta_data["data_type"] = self.data_type
  25. url = meta_data["url"]
  26. chunks = self.get_chunks(content)
  27. for chunk in chunks:
  28. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  29. if idMap.get(chunk_id) is None:
  30. idMap[chunk_id] = True
  31. ids.append(chunk_id)
  32. documents.append(chunk)
  33. metadatas.append(meta_data)
  34. return {
  35. "documents": documents,
  36. "ids": ids,
  37. "metadatas": metadatas,
  38. }
  39. def get_chunks(self, content):
  40. """
  41. Returns chunks using text splitter instance.
  42. Override in child class if custom logic.
  43. """
  44. return self.text_splitter.split_text(content)
  45. def set_data_type(self, data_type):
  46. """
  47. set the data type of chunker
  48. """
  49. self.data_type = data_type