base_chunker.py 1.1 KB

1234567891011121314151617181920212223242526272829303132333435
  1. import hashlib
  2. class BaseChunker:
  3. def __init__(self, text_splitter):
  4. self.text_splitter = text_splitter
  5. def create_chunks(self, loader, src):
  6. """
  7. Loads data and chunks it.
  8. :param loader: The loader which's `load_data` method is used to create the raw data.
  9. :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders.
  10. """
  11. documents = []
  12. ids = []
  13. datas = loader.load_data(src)
  14. metadatas = []
  15. for data in datas:
  16. content = data["content"]
  17. meta_data = data["meta_data"]
  18. url = meta_data["url"]
  19. chunks = self.text_splitter.split_text(content)
  20. for chunk in chunks:
  21. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  22. ids.append(chunk_id)
  23. documents.append(chunk)
  24. metadatas.append(meta_data)
  25. return {
  26. "documents": documents,
  27. "ids": ids,
  28. "metadatas": metadatas,
  29. }