base_chunker.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. import hashlib
  2. class BaseChunker:
  3. def __init__(self, text_splitter):
  4. """Initialize the chunker."""
  5. self.text_splitter = text_splitter
  6. def create_chunks(self, loader, src):
  7. """
  8. Loads data and chunks it.
  9. :param loader: The loader which's `load_data` method is used to create
  10. the raw data.
  11. :param src: The data to be handled by the loader. Can be a URL for
  12. remote sources or local content for local loaders.
  13. """
  14. documents = []
  15. ids = []
  16. idMap = {}
  17. datas = loader.load_data(src)
  18. metadatas = []
  19. for data in datas:
  20. content = data["content"]
  21. meta_data = data["meta_data"]
  22. url = meta_data["url"]
  23. chunks = self.text_splitter.split_text(content)
  24. for chunk in chunks:
  25. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  26. if idMap.get(chunk_id) is None:
  27. idMap[chunk_id] = True
  28. ids.append(chunk_id)
  29. documents.append(chunk)
  30. metadatas.append(meta_data)
  31. return {
  32. "documents": documents,
  33. "ids": ids,
  34. "metadatas": metadatas,
  35. }