Please kindly go through the code below and let me know what to add.
from langchain.document_loaders.sitemap import SitemapLoader
from langchain.text_splitter import CharacterTextSplitter

# Script purpose: load every page listed in a sitemap and split the resulting
# documents into small text chunks.

# Placeholder kept from the original snippet: replace it with the full URL of
# the sitemap to crawl (for example "https://example.com/sitemap.xml").
sitemap_loader = SitemapLoader(web_path="websiteURL-sitemap.xml")

# Configure the loader BEFORE calling load(): attributes assigned after the
# crawl has already run cannot influence it.
sitemap_loader.requests_per_second = 2

# Optional: avoid the [SSL: CERTIFICATE_VERIFY_FAILED] issue.
# NOTE(security): verify=False turns off TLS certificate verification for the
# loader's HTTP requests; only keep it for hosts you trust, and remove it once
# the certificate problem is resolved.
sitemap_loader.requests_kwargs = {"verify": False}

# Fetch the pages referenced by the sitemap.
docs = sitemap_loader.load()

# Split the loaded documents into chunks with no overlap between neighbours.
# NOTE(review): chunk_size=40 looks very small for downstream use (e.g.
# embeddings/retrieval) -- confirm this value is intentional.
text_splitter = CharacterTextSplitter(chunk_size=40, chunk_overlap=0)
texts = text_splitter.split_documents(docs)