# base_chunker.py
  1. import hashlib
  2. class BaseChunker:
  3. def __init__(self, text_splitter):
  4. self.text_splitter = text_splitter
  5. def create_chunks(self, loader, src):
  6. """
  7. Loads data and chunks it.
  8. :param loader: The loader which's `load_data` method is used to create the raw data.
  9. :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders.
  10. """
  11. documents = []
  12. ids = []
  13. idMap = {}
  14. datas = loader.load_data(src)
  15. metadatas = []
  16. for data in datas:
  17. content = data["content"]
  18. meta_data = data["meta_data"]
  19. url = meta_data["url"]
  20. chunks = self.text_splitter.split_text(content)
  21. for chunk in chunks:
  22. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  23. if (idMap.get(chunk_id) is None):
  24. idMap[chunk_id] = True
  25. ids.append(chunk_id)
  26. documents.append(chunk)
  27. metadatas.append(meta_data)
  28. return {
  29. "documents": documents,
  30. "ids": ids,
  31. "metadatas": metadatas,
  32. }