Zack Saadioui
8/24/2024
```bash
pip install langchain langchain-community
```
1
UnstructuredHTMLLoader
```python
from langchain_community.document_loaders import UnstructuredHTMLLoader
```
```python
loader = UnstructuredHTMLLoader("example_data/fake-content.html")
data = loader.load()
```
```python
print(data)
```
```python
[Document(page_content='My First Heading\n\nMy first paragraph.', metadata={'source': 'example_data/fake-content.html'})]
```
1
BSHTMLLoader
```bash
pip install beautifulsoup4
```
```python
from langchain_community.document_loaders import BSHTMLLoader
```
```python
loader = BSHTMLLoader("example_data/fake-content.html")
data = loader.load()
```
```python
print(data)
```
```python
[Document(page_content='\n\nTest Title\n\n\nMy First Heading\nMy first paragraph.\n\n\n', metadata={'source': 'example_data/fake-content.html', 'title': 'Test Title'})]
```
1
SpiderLoader
```bash
pip install --upgrade --quiet langchain langchain-community spider-client
```
1
SpiderLoader
```python
from langchain_community.document_loaders import SpiderLoader

loader = SpiderLoader(
    api_key="YOUR_API_KEY",
    url="https://example.com",
    mode="crawl"
)
data = loader.load()
```
```python
print(data)
```
1
FireCrawlLoader
1
AzureAIDocumentIntelligenceLoader
```python
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
```
```python
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint="<endpoint>",
    api_key="<key>",
    file_path="<filepath>",
    api_model="prebuilt-layout"
)
documents = loader.load()
```
Copyright © Arsturn 2024