1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859 |
- import hashlib
class BaseChunker:
    """Split loaded data into de-duplicated chunks with ids and metadata."""

    def __init__(self, text_splitter):
        """Initialize the chunker.

        :param text_splitter: Object whose `split_text(content)` method is
        called by `get_chunks` to split content into chunks.
        """
        self.text_splitter = text_splitter
        # Stays None until `set_data_type` is called.
        self.data_type = None

    def create_chunks(self, loader, src):
        """
        Loads data and chunks it.

        :param loader: The loader whose `load_data` method is used to create
        the raw data. Each item it yields must provide a "content" entry and
        a "meta_data" mapping that contains a "url" key.
        :param src: The data to be handled by the loader. Can be a URL for
        remote sources or local content for local loaders.
        :return: A dict with three parallel lists: "documents" (the chunks),
        "ids" (sha256 hex digest of chunk text + url) and "metadatas" (one
        independent dict per chunk, including the chunker's "data_type").
        :raises KeyError: If an item lacks "content", "meta_data" or the
        "url" metadata key.
        """
        documents = []
        ids = []
        metadatas = []
        # Ids already emitted; a chunk repeated under the same url is skipped.
        seen_ids = set()

        for data in loader.load_data(src):
            content = data["content"]

            # Copy so that the loader's own record is not modified when the
            # data type is added.
            meta_data = dict(data["meta_data"])
            # add data type to meta data to allow query using data type
            meta_data["data_type"] = self.data_type
            url = meta_data["url"]

            for chunk in self.get_chunks(content):
                # NOTE: the id scheme (chunk text directly followed by url) is
                # kept as-is; changing it would change every generated id.
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                if chunk_id in seen_ids:
                    continue
                seen_ids.add(chunk_id)
                ids.append(chunk_id)
                documents.append(chunk)
                # One dict per chunk so entries can be edited independently.
                metadatas.append(dict(meta_data))

        return {
            "documents": documents,
            "ids": ids,
            "metadatas": metadatas,
        }

    def get_chunks(self, content):
        """
        Returns chunks using text splitter instance.

        Override in child class if custom logic.

        :param content: The content to split.
        :return: Whatever `self.text_splitter.split_text(content)` returns.
        """
        return self.text_splitter.split_text(content)

    def set_data_type(self, data_type):
        """
        Set the data type of the chunker.

        :param data_type: Value stored under the "data_type" key of every
        metadata dict produced by `create_chunks`.
        """
        self.data_type = data_type
|