Source code for langchain_community.document_loaders.hn
from typing import Any, List
from langchain_core.documents import Document
from langchain_community.document_loaders.web_base import WebBaseLoader
class HNLoader(WebBaseLoader):
    """Load `Hacker News` data.

    It loads data from either main page results or the comments page.
    """

    def load(self) -> List[Document]:
        """Get important HN webpage information.

        The page at ``self.web_path`` is scraped, then handed to
        ``load_comments`` when the URL contains the substring ``"item"`` and
        to ``load_results`` otherwise.

        HN webpage components are:
            - title
            - content
            - source url,
            - time of post
            - author of the post
            - number of comments
            - rank of the post

        Returns:
            The documents produced by whichever parser was selected.
        """
        soup_info = self.scrape()
        # Plain substring test on the whole URL, not a parsed-path comparison.
        if "item" in self.web_path:
            return self.load_comments(soup_info)
        return self.load_results(soup_info)

    def load_results(self, soup: Any) -> List[Document]:
        """Load items from an HN page.

        Args:
            soup: Parsed page exposing ``select`` with CSS selectors
                (presumably the BeautifulSoup object returned by
                ``scrape`` -- confirm against the base loader).

        Returns:
            One document per ``tr`` row carrying the ``athing`` class that
            has a ``titleline`` span. ``page_content`` is the stripped title
            text; ``metadata`` holds ``source``, ``title``, ``link`` and
            ``ranking``. Rows without a title span are skipped. ``link`` is
            ``None`` when the title has no anchor (or the anchor has no
            ``href``), and ``ranking`` is ``""`` when the row has no rank
            span.
        """
        documents = []
        # Match on the CSS class rather than the exact attribute value so
        # rows whose class attribute carries additional tokens are not
        # silently missed.
        for row in soup.select("tr.athing"):
            title_tag = row.find("span", {"class": "titleline"})
            if title_tag is None:
                # Not a story row (nothing to use as content); skip it
                # instead of failing on a ``None`` attribute access.
                continue

            anchor = title_tag.find("a")
            rank_tag = row.select_one("span[class='rank']")

            title = title_tag.text.strip()
            link = anchor.get("href") if anchor is not None else None
            ranking = rank_tag.text if rank_tag is not None else ""

            metadata = {
                "source": self.web_path,
                "title": title,
                "link": link,
                "ranking": ranking,
            }
            # ``link`` and ``ranking`` live in ``metadata`` only; they are
            # not passed as separate keyword arguments to ``Document``.
            documents.append(Document(page_content=title, metadata=metadata))
        return documents