blob: 6cbc60b493a05e25738cc2683b8f136bf4a0102b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
#!/bin/bash
# fetch-pages — incrementally mirror a soup.io blog's "endless scroll" pages.
# Fetches each page into OUTDIR/pages/<since>.html, following the next-page
# URL embedded in the page, and stops once it reaches pages already on disk.
set -eu

if [[ $# -lt 1 || $# -gt 2 ]]; then
  cat >&2 <<'ENDUSAGE'
Usage: fetch-pages URL [ OUTDIR ]
URL is the base domain of your Soup (e.g. 'kitchen.soup.io').
OUTDIR defaults the current directory. A directory called 'pages' will be
created inside the output directory.
ENDUSAGE
  exit 1
fi

BASE="http://$1"
OUTDIR="${2:-.}"
mkdir -p "${OUTDIR}/pages"

url='/since/0?mode=own'

# Don't download pages older than the newest "since" we already have.
# Saved pages are named "<since>.html"; take the numerically largest stem.
# An empty pages/ directory leaves LIMIT=0, so everything gets fetched.
LIMIT=0
for f in "${OUTDIR}/pages"/*.html; do
  [[ -e "$f" ]] || continue          # glob matched nothing
  n="${f##*/}"
  n="${n%.html}"
  [[ "$n" =~ ^[0-9]+$ ]] || continue # ignore stray non-numeric files
  if (( n > LIMIT )); then LIMIT="$n"; fi
done

while [[ -n "$url" ]]; do
  # Extract the "since" timestamp from URLs of the form /since/NNN?mode=own.
  since="$(echo "$url" | sed -r 's#^/since/([^?]+)\?.*$#\1#')"
  # Stop when we reach a page we already fetched. The first page (since=0)
  # is always refetched because it may contain new posts.
  if [[ "$since" -le "$LIMIT" && "$since" -ne 0 ]]; then break; fi
  echo "Fetching ${BASE}${url}..." >&2
  curl -f --retry 20 -L -o "${OUTDIR}/pages/${since}.html" "${BASE}${url}"
  # The next page's URL is embedded in the endless-scroll JS snippet:
  #   SOUP.Endless.next_url = '/since/NNN?mode=own';
  # If the snippet is absent (last page), $url becomes empty and the loop ends.
  url="$(sed -r -n '/SOUP\.Endless\.next_url/ {s#^.*SOUP\.Endless\.next_url = '\''([^'\'']+)'\'';.*$#\1#;p}' "${OUTDIR}/pages/${since}.html")"
  # Be nice, don't overload the servers!
  sleep 1
done
|