sitemap.py 763 B

123456789101112131415161718192021222324
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from embedchain.loaders.web_page import WebPageLoader
  4. class SitemapLoader:
  5. def load_data(self, sitemap_url):
  6. """
  7. This method takes a sitemap URL as input and retrieves
  8. all the URLs to use the WebPageLoader to load content
  9. of each page.
  10. """
  11. output = []
  12. web_page_loader = WebPageLoader()
  13. response = requests.get(sitemap_url)
  14. response.raise_for_status()
  15. soup = BeautifulSoup(response.text, "xml")
  16. links = [link.text for link in soup.find_all("loc")]
  17. for link in links:
  18. each_load_data = web_page_loader.load_data(link)
  19. output.append(each_load_data)
  20. return [data[0] for data in output]