-
-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathcoro.py
87 lines (67 loc) · 2.39 KB
/
coro.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
from __future__ import annotations
import re
import typing as T
from pathlib import Path
import warnings
import asyncio
import itertools
import logging
import aiohttp
from . import files
# tuples, not lists
# Exception types that mark a URL as BAD (connection refused / server
# dropped the connection) — caught and recorded in check_url().
EXC = (
    aiohttp.client_exceptions.ClientConnectorError,
    aiohttp.client_exceptions.ServerDisconnectedError,
)
# Timeouts are treated as OK: a slow site is skipped, not flagged as broken.
OKE = asyncio.TimeoutError
# total per-request timeout, in seconds (aiohttp.ClientTimeout(total=...))
TIMEOUT = 10
async def check_urls(
    path: Path,
    regex: str,
    ext: str,
    hdr: dict[str, str] | None = None,
    method: str = "get",
    recurse: bool = False,
) -> list[tuple[str, str, T.Any]]:
    """Concurrently check all URLs matching *regex* in files under *path*.

    Parameters
    ----------
    path: directory (or file) scanned via files.get()
    regex: pattern whose matches are treated as URLs
    ext: file suffix to scan (".md" triggers delimiter stripping in check_url)
    hdr: optional HTTP headers sent with each request
    method: "get" or "head" (passed through to check_url)
    recurse: recurse into subdirectories

    Returns
    -------
    Flattened list of (filename, url, exception-or-status) tuples for every
    URL that failed.
    """
    pat = re.compile(regex)  # renamed: "glob" shadowed a stdlib module name
    tasks = [
        check_url(fn, pat, ext, hdr, method=method)
        for fn in files.get(path, ext, recurse)
    ]
    # Scope the suppression: the original simplefilter()/resetwarnings() pair
    # clobbered every warning filter installed elsewhere in the process.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        urls = await asyncio.gather(*tasks)
    # per aiohttp manual: for HTTPS/SSL sites, sleep ~250 ms once just before
    # closing the event loop so transports shut down cleanly (not per site)
    await asyncio.sleep(0.250)
    return list(itertools.chain.from_iterable(urls))  # flatten list of lists
async def check_url(
    fn: Path, glob, ext: str, hdr: dict[str, str] | None = None, *, method: str = "get"
) -> list[tuple[str, str, T.Any]]:
    """Check every URL found in file *fn*; return the ones that failed.

    Parameters
    ----------
    fn: file to scan
    glob: compiled regex whose findall() yields the URL candidates
    ext: file suffix; ".md" matches are stripped of their first/last char
    hdr: optional HTTP headers for each request
    method: "get" or "head"

    Returns
    -------
    List of (filename, url, exception-or-status) tuples. Timeouts are
    skipped (treated as OK), not recorded.

    Raises
    ------
    ValueError: if *method* is neither "get" nor "head".
    """
    if method not in ("get", "head"):
        # fail fast — the original only raised mid-scan, on the first URL
        raise ValueError(f"Unknown retrieve method {method}")
    urls = glob.findall(fn.read_text(errors="ignore"))
    # lazy %-style args: the original passed the join as a bogus format
    # argument to a message with no placeholders, so it was never shown
    logging.debug("%s %s", fn.name, " ".join(urls))
    bad: list[tuple[str, str, T.Any]] = []
    timeout = aiohttp.ClientTimeout(total=TIMEOUT)
    # one session reused for the whole file — aiohttp recommends session
    # reuse; the original opened a fresh session per URL inside the loop
    async with aiohttp.ClientSession(headers=hdr, timeout=timeout) as session:
        for url in urls:
            if ext == ".md":
                # presumably the regex captures surrounding delimiters
                # (e.g. "(...)" or "<...>") for Markdown — TODO confirm
                url = url[1:-1]
            try:
                # anti-crawling behavior doesn't like .head() method--.get()
                # is slower but avoids lots of false positives
                if method == "get":
                    async with session.get(url, allow_redirects=True) as resp:
                        code = resp.status
                else:
                    async with session.head(url, allow_redirects=True) as resp:
                        code = resp.status
            except OKE:
                continue  # timeout: slow site, not a broken link
            except EXC as e:
                bad.append((fn.name, url, e))  # e, not str(e)
                print("\n", bad[-1])
                continue
            if code != 200:
                bad.append((fn.name, url, code))
                print("\n", bad[-1])
            else:
                logging.info(f"OK: {url:80s}")
    return bad