Просмотр исходного кода

feat: add UA header for pdf and sitemap (#1222)

Taranjeet Singh 1 год назад
Родитель
Коммит
8f28264aec
2 изменённых файла: 8 добавлений и 2 удаления
  1. 4 1
      embedchain/loaders/pdf_file.py
  2. 4 1
      embedchain/loaders/sitemap.py

+ 4 - 1
embedchain/loaders/pdf_file.py

@@ -15,7 +15,10 @@ from embedchain.utils.misc import clean_string
 class PdfFileLoader(BaseLoader):
     def load_data(self, url):
         """Load data from a PDF file."""
-        loader = PyPDFLoader(url)
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  # noqa:E501
+        }
+        loader = PyPDFLoader(url, headers=headers)
         data = []
         all_content = []
         pages = loader.load_and_split()

+ 4 - 1
embedchain/loaders/sitemap.py

@@ -31,10 +31,13 @@ class SitemapLoader(BaseLoader):
     def load_data(self, sitemap_source):
         output = []
         web_page_loader = WebPageLoader()
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",  # noqa:E501
+        }
 
         if urlparse(sitemap_source).scheme in ("http", "https"):
             try:
-                response = requests.get(sitemap_source)
+                response = requests.get(sitemap_source, headers=headers)
                 response.raise_for_status()
                 soup = BeautifulSoup(response.text, "xml")
             except requests.RequestException as e: