134 lines
5.7 KiB
Haskell
134 lines
5.7 KiB
Haskell
{-# LANGUAGE GHC2021 #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
-- | External link classification.
|
|
--
|
|
-- Walks all @Link@ inlines and:
|
|
-- * Adds @class="link-external"@ to any link whose URL starts with
|
|
-- @http://@ or @https://@ and is not on the site's own domain.
|
|
-- * Adds @data-link-icon@ / @data-link-icon-type@ attributes for
|
|
-- per-domain brand icons (see 'domainIcon' for the full list).
|
|
-- * Adds @target="_blank" rel="noopener noreferrer"@ to external links.
|
|
module Filters.Links (apply) where
|
|
|
|
import Data.Text (Text)
|
|
import qualified Data.Text as T
|
|
import Text.Pandoc.Definition
|
|
import Text.Pandoc.Walk (walk)
|
|
|
|
import Config (siteHost)
|
|
|
|
-- | Apply link classification to the entire document.
|
|
-- Two passes: PDF links first (rewrites href to viewer URL), then external
|
|
-- link classification (operates on http/https, so no overlap).
|
|
apply :: Pandoc -> Pandoc
|
|
apply = walk classifyLink . walk classifyPdfLink
|
|
|
|
-- | Rewrite root-relative PDF links to open via the vendored PDF.js viewer.
|
|
-- Preserves the original path in @data-pdf-src@ so the popup thumbnail
|
|
-- provider can locate the corresponding @.thumb.png@ file.
|
|
-- Skips links that are already pointing at the viewer (idempotent).
|
|
--
|
|
-- Handles fragment identifiers (e.g. @/papers/foo.pdf#page=5@): the
|
|
-- fragment is stripped before the @.pdf@ suffix check and re-attached
|
|
-- after the viewer URL so PDF.js's anchor handling works.
|
|
classifyPdfLink :: Inline -> Inline
|
|
classifyPdfLink (Link (ident, classes, kvs) ils (url, title))
|
|
| "/" `T.isPrefixOf` url
|
|
, let (path, fragment) = T.break (== '#') url
|
|
, ".pdf" `T.isSuffixOf` T.toLower path
|
|
, "pdf-link" `notElem` classes =
|
|
let viewerUrl = "/pdfjs/web/viewer.html?file="
|
|
<> encodeQueryValue path <> fragment
|
|
classes' = classes ++ ["pdf-link"]
|
|
kvs' = kvs ++ [("data-pdf-src", path)]
|
|
in Link (ident, classes', kvs') ils (viewerUrl, title)
|
|
classifyPdfLink x = x
|
|
|
|
classifyLink :: Inline -> Inline
|
|
classifyLink (Link (ident, classes, kvs) ils (url, title))
|
|
| isExternal url =
|
|
let icon = domainIcon url
|
|
classes' = classes ++ ["link-external"]
|
|
kvs' = kvs
|
|
++ [("target", "_blank")]
|
|
++ [("rel", "noopener noreferrer")]
|
|
++ [("data-link-icon", icon)]
|
|
++ [("data-link-icon-type", "svg")]
|
|
in Link (ident, classes', kvs') ils (url, title)
|
|
classifyLink x = x
|
|
|
|
-- ---------------------------------------------------------------------------
|
|
-- Helpers
|
|
-- ---------------------------------------------------------------------------
|
|
|
|
-- | True if the URL points outside the site's domain.
|
|
--
|
|
-- Uses a strict hostname comparison rather than substring matching, so
|
|
-- that a hostile lookalike like @evil-example.com.attacker.com@ is
|
|
-- correctly classified as external (and gets @rel=noopener noreferrer@
|
|
-- plus @target=_blank@ applied).
|
|
isExternal :: Text -> Bool
|
|
isExternal url =
|
|
case extractHost url of
|
|
Nothing -> False
|
|
Just host ->
|
|
not (host == siteHost || ("." <> siteHost) `T.isSuffixOf` host)
|
|
|
|
-- | Extract the lowercased hostname from an absolute http(s) URL.
|
|
-- Returns 'Nothing' for non-http(s) URLs (relative paths, mailto:, etc.).
|
|
extractHost :: Text -> Maybe Text
|
|
extractHost url
|
|
| Just rest <- T.stripPrefix "https://" url = Just (hostOf rest)
|
|
| Just rest <- T.stripPrefix "http://" url = Just (hostOf rest)
|
|
| otherwise = Nothing
|
|
where
|
|
hostOf rest =
|
|
let withPort = T.takeWhile (\c -> c /= '/' && c /= '?' && c /= '#') rest
|
|
host = T.takeWhile (/= ':') withPort
|
|
in T.toLower host
|
|
|
|
-- | Icon name for the link, matching a file in /images/link-icons/<name>.svg.
|
|
domainIcon :: Text -> Text
|
|
domainIcon url
|
|
-- Scholarly / reference
|
|
| "wikipedia.org" `T.isInfixOf` url = "wikipedia"
|
|
| "arxiv.org" `T.isInfixOf` url = "arxiv"
|
|
| "doi.org" `T.isInfixOf` url = "doi"
|
|
| "worldcat.org" `T.isInfixOf` url = "worldcat"
|
|
| "orcid.org" `T.isInfixOf` url = "orcid"
|
|
| "archive.org" `T.isInfixOf` url = "internet-archive"
|
|
-- Code / software
|
|
| "github.com" `T.isInfixOf` url = "github"
|
|
| "tensorflow.org" `T.isInfixOf` url = "tensorflow"
|
|
-- AI companies
|
|
| "anthropic.com" `T.isInfixOf` url = "anthropic"
|
|
| "openai.com" `T.isInfixOf` url = "openai"
|
|
-- Social / media
|
|
| "twitter.com" `T.isInfixOf` url = "twitter"
|
|
| "x.com" `T.isInfixOf` url = "twitter"
|
|
| "reddit.com" `T.isInfixOf` url = "reddit"
|
|
| "youtube.com" `T.isInfixOf` url = "youtube"
|
|
| "youtu.be" `T.isInfixOf` url = "youtube"
|
|
| "tiktok.com" `T.isInfixOf` url = "tiktok"
|
|
| "substack.com" `T.isInfixOf` url = "substack"
|
|
| "news.ycombinator.com" `T.isInfixOf` url = "hacker-news"
|
|
-- News
|
|
| "nytimes.com" `T.isInfixOf` url = "new-york-times"
|
|
-- Institutions
|
|
| "nasa.gov" `T.isInfixOf` url = "nasa"
|
|
| "apple.com" `T.isInfixOf` url = "apple"
|
|
| otherwise = "external"
|
|
|
|
-- | Percent-encode characters that would break a @?file=@ query-string value.
|
|
-- Slashes are intentionally left unencoded so root-relative paths remain
|
|
-- readable and work correctly with PDF.js's internal fetch.
|
|
encodeQueryValue :: Text -> Text
|
|
encodeQueryValue = T.concatMap enc
|
|
where
|
|
enc ' ' = "%20"
|
|
enc '&' = "%26"
|
|
enc '?' = "%3F"
|
|
enc '+' = "%2B"
|
|
enc '"' = "%22"
|
|
enc c = T.singleton c
|