add recurse option (remote only)
scivision committed Feb 25, 2021
1 parent bd50af6 commit 259c8c8
Showing 6 changed files with 64 additions and 31 deletions.
setup.cfg: 2 changes (1 addition & 1 deletion)

@@ -1,6 +1,6 @@
 [metadata]
 name = linkcheckmd
-version = 1.3.0
+version = 1.3.1
 author = Michael Hirsch, Ph.D.
 author_email = [email protected]
 url = https://github.com/scivision/linkchecker-markdown
src/linkcheckmd/__main__.py: 9 changes (8 additions & 1 deletion)

@@ -32,14 +32,21 @@ def main():
     p.add_argument("-v", "--verbose", action="store_true")
     p.add_argument("--sync", help="don't use asyncio", action="store_true")
     p.add_argument("-local", help="only check local files", action="store_true")
+    p.add_argument("-r", "--recurse", help="recurse directories under path", action="store_true")
     P = p.parse_args()

     if P.verbose:
         logging.basicConfig(level=logging.INFO)

     tic = time.monotonic()
     check_links(
-        P.path, ext=P.ext, domain=P.domain, method=P.method, use_async=not P.sync, local=P.local
+        P.path,
+        ext=P.ext,
+        domain=P.domain,
+        method=P.method,
+        use_async=not P.sync,
+        local=P.local,
+        recurse=P.recurse,
     )

     print(f"{time.monotonic() - tic:0.3} seconds to check links")
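For context, this wires the new flag straight through to check_links, so from the command line the option would be used as, e.g., python -m linkcheckmd docs -r (the docs path is illustrative; the -m invocation is assumed from this argparse entry point). A minimal library-level sketch of the same call, importing check_links from base.py as this commit defines it:

    from linkcheckmd.base import check_links

    # mirrors the CLI: async remote checking with the new recursion enabled
    # "docs" is an illustrative path, not from the commit
    check_links("docs", ext=".md", use_async=True, local=False, recurse=True)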
src/linkcheckmd/base.py: 42 changes (17 additions & 25 deletions)

@@ -6,6 +6,7 @@
 import asyncio

 from .coro import check_urls
+from . import files

 # http://www.useragentstring.com
 USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0"
@@ -20,14 +21,21 @@ def check_links(
     method: str = "get",
     use_async: bool = True,
     local: bool = False,
+    recurse: bool = False,
 ) -> T.Iterable[tuple]:

-    for bad in check_local(path, ext=ext):
-        print(bad)
+    if local and recurse:
+        logging.error("'recurse' currently works only for remote links.")
+
+    for missing_file in check_local(path, ext=ext):
+        # to get an iterable/list of these, call check_local directly from your program
+        print(missing_file)

     bad = None
     if not local:
-        bad = check_remotes(path, domain, ext=ext, hdr=hdr, method=method, use_async=use_async)
+        bad = check_remotes(
+            path, domain, ext=ext, hdr=hdr, method=method, use_async=use_async, recurse=recurse
+        )

     return bad

@@ -42,7 +50,7 @@ def check_local(path: Path, ext: str) -> T.Iterable[tuple[str, str]]:

     path = Path(path).resolve().expanduser()  # must have .resolve()

-    for fn in get_files(path, ext):
+    for fn in files.get(path, ext):
         urls = glob.findall(fn.read_text(errors="ignore"))

         for url in urls:
@@ -76,6 +84,7 @@ def check_remotes(
     hdr: dict[str, str] = None,
     method: str = "get",
     use_async: bool = True,
+    recurse: bool = False,
 ) -> list[tuple[str, str, T.Any]]:
     if domain:
         pat = "https?://" + domain + r"[=a-zA-Z0-9\_\/\?\&\%\+\#\.\-]*"
@@ -87,34 +96,17 @@

     logging.debug(f"regex {pat}")

-    flist = get_files(path, ext)
-
     if not hdr:
         hdr = {"User-Agent": USER_AGENT}

     # %% session
     if use_async:
-        urls = asyncio.run(check_urls(flist, pat, ext, hdr, method))
+        urls = asyncio.run(
+            check_urls(path, regex=pat, ext=ext, hdr=hdr, method=method, recurse=recurse)
+        )
     else:
         from .sync import check_urls as sync_urls

-        urls = sync_urls(flist, pat, ext, hdr)
+        urls = sync_urls(path, pat, ext, hdr)

     return urls
-
-
-def get_files(path: Path, ext: str) -> T.Iterable[Path]:
-
-    path = Path(path).expanduser().resolve()
-
-    if path.is_dir():
-        for p in path.iterdir():
-            if p.is_file() and p.suffix == ext:
-                yield p
-            elif p.is_dir() and (p / "index.md").is_file():
-                # Hugo PageResource
-                yield p / "index.md"
-    elif path.is_file():
-        yield path
-    else:
-        raise FileNotFoundError(path)
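The new in-line comment in check_links suggests calling check_local directly when you want the broken local links as data rather than printed output. A hedged sketch: check_local yields 2-tuples per its tuple[str, str] annotation, the (file, url) layout is an assumption, and "docs" is an illustrative path:

    from linkcheckmd.base import check_local

    # collect rather than print; each item is a 2-tuple per the annotation
    missing = list(check_local("docs", ext=".md"))
    for item in missing:
        print(item)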
src/linkcheckmd/coro.py: 7 changes (5 additions & 2 deletions)

@@ -9,6 +9,8 @@

 import aiohttp

+from . import files
+
 # tuples, not lists

 EXC = (
@@ -20,16 +22,17 @@


 async def check_urls(
-    flist: T.Iterable[Path],
+    path: Path,
     regex: str,
     ext: str,
     hdr: dict[str, str] = None,
     method: str = "get",
+    recurse: bool = False,
 ) -> list[tuple[str, str, T.Any]]:

     glob = re.compile(regex)

-    tasks = [check_url(fn, glob, ext, hdr, method=method) for fn in flist]
+    tasks = [check_url(fn, glob, ext, hdr, method=method) for fn in files.get(path, ext, recurse)]

     warnings.simplefilter("ignore")
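The coroutine now takes the root path and the recurse flag and discovers files itself via files.get, instead of receiving a pre-built file list. A sketch of driving it directly, assuming the import paths shown in this commit (the URL regex and "docs" path are illustrative; base.check_remotes builds the real pattern):

    import asyncio
    from pathlib import Path

    from linkcheckmd.base import USER_AGENT
    from linkcheckmd.coro import check_urls

    # illustrative pattern, borrowed from the domain case in base.check_remotes
    pat = r"https?://[=a-zA-Z0-9\_\/\?\&\%\+\#\.\-]*"
    bad = asyncio.run(
        check_urls(Path("docs"), regex=pat, ext=".md", hdr={"User-Agent": USER_AGENT}, recurse=True)
    )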
src/linkcheckmd/files.py: 28 changes (28 additions & 0 deletions)

@@ -0,0 +1,28 @@
+from pathlib import Path
+import typing as T
+
+
+def get(path: Path, ext: str, recurse: bool = False) -> T.Iterable[Path]:
+    """
+    yield files in path with suffix ext. Optionally, recurse directories.
+    """
+
+    path = Path(path).expanduser().resolve()
+
+    if path.is_dir():
+        for p in path.iterdir():
+            if p.is_file() and p.suffix == ext:
+                yield p
+            elif p.is_dir():
+                if recurse:
+                    yield from get(p, ext, recurse)
+                else:
+                    # Hugo PageResource
+                    for n in ("index.md", "_index.md"):
+                        if (p / n).is_file():
+                            yield p / n
+                            break
+    elif path.is_file():
+        yield path
+    else:
+        raise FileNotFoundError(path)
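This generalizes the former base.get_files: a file path yields itself, a directory yields its matching files, and subdirectories are either fully recursed or probed only for Hugo page bundles (index.md, and now also _index.md). A usage sketch with an assumed "content" directory:

    from linkcheckmd.files import get

    # one level deep, plus Hugo index.md/_index.md page bundles
    top = list(get("content", ".md"))

    # every *.md file under the whole tree
    everything = list(get("content", ".md", recurse=True))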
src/linkcheckmd/sync.py: 7 changes (5 additions & 2 deletions)

@@ -7,6 +7,8 @@
 import urllib3
 import logging

+from . import files
+
 TIMEOUT = 10
 RETRYCODES = (400, 404, 405, 503)
 # multiple exceptions must be tuples, not lists in general
@@ -19,11 +21,12 @@


 def check_urls(
-    flist: T.Iterable[Path],
+    path: Path,
     regex: str,
     ext: str = ".md",
     hdr: dict[str, str] = None,
     verifycert: bool = False,
+    recurse: bool = False,
 ) -> list[tuple[str, str, T.Any]]:

     bads: list[tuple[str, str, T.Any]] = []
@@ -37,7 +40,7 @@ def check_urls(
         sess.headers.update(hdr)
     sess.max_redirects = 5
     # %% loop
-    for fn in flist:
+    for fn in files.get(path, ext, recurse):
        for bad in check_url(fn, glob, ext, sess, hdr, verifycert):
             print("\n", bad)
             bads.append(bad)
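The synchronous checker gets the same path/recurse signature. Note that base.py's sync branch above still calls sync_urls(path, pat, ext, hdr) without forwarding recurse, so in this commit recursion appears to take effect on the sync path only when check_urls is called directly. A sketch (illustrative path and pattern; the 3-tuple contents follow the return annotation, and the (file, url, error) reading is an assumption):

    from linkcheckmd.sync import check_urls

    # returns a list of (str, str, Any) tuples describing bad links
    bads = check_urls("docs", regex=r"https?://[=a-zA-Z0-9\_\/\?\&\%\+\#\.\-]*", recurse=True)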
