youtube_video.py 996 B

123456789101112131415161718192021222324252627282930313233343536
  1. import hashlib
  2. from langchain.text_splitter import RecursiveCharacterTextSplitter
  3. TEXT_SPLITTER_CHUNK_PARAMS = {
  4. "chunk_size": 2000,
  5. "chunk_overlap": 0,
  6. "length_function": len,
  7. }
  8. TEXT_SPLITTER = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
  9. class YoutubeVideoChunker:
  10. def create_chunks(self, loader, url):
  11. documents = []
  12. ids = []
  13. datas = loader.load_data(url)
  14. metadatas = []
  15. for data in datas:
  16. content = data["content"]
  17. meta_data = data["meta_data"]
  18. chunks = TEXT_SPLITTER.split_text(content)
  19. url = meta_data["url"]
  20. for chunk in chunks:
  21. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  22. ids.append(chunk_id)
  23. documents.append(chunk)
  24. metadatas.append(meta_data)
  25. return {
  26. "documents": documents,
  27. "ids": ids,
  28. "metadatas": metadatas,
  29. }