#!/bin/bash
#
# fetch-pages - download the HTML pages of a Soup into OUTDIR/pages.
#
# Usage: fetch-pages URL [ OUTDIR ]
#
# Starting at '/since/0?mode=own', every page is stored as
# OUTDIR/pages/<since>.html.  The address of the following page is taken
# from the "SOUP.Endless.next_url = '...';" assignment inside the page that
# was just downloaded; fetching stops when a page carries no such assignment
# or when a page that was already downloaded by an earlier run is reached.
#
# Requires: bash, curl, sed.
#
# Provenance: reconstructed from the patch "Initial commit"
# (765ed07354c655210ee25586988bad98353ebeef, Matthias Schiffer, 2017-01-08),
# which created this file with mode 100755.

set -eu

# Path of the download currently in progress.  Kept global (not local to
# main) so that the EXIT trap can still read it under 'set -u'.
partial_file=''

# Remove an unfinished download so that a later run does not mistake a
# truncated file for a completely fetched page.
cleanup() {
  if [[ -n "$partial_file" ]]; then
    rm -f -- "$partial_file"
  fi
}

# Print the usage text to stderr and terminate with status 1.
usage() {
  cat >&2 <<'ENDUSAGE'
Usage: fetch-pages URL [ OUTDIR ]

URL is the base domain of your Soup (e.g. 'kitchen.soup.io').

OUTDIR defaults the current directory. A directory called 'pages' will be
created inside the output directory.
ENDUSAGE
  exit 1
}

#######################################
# Print the largest numeric "since" value among the <since>.html files
# in a directory, or 0 when there is none.
# Arguments: $1 - directory holding the downloaded pages
# Outputs:   the number, followed by a newline, on stdout
#######################################
newest_since() {
  local dir=$1
  local newest=0
  local file name

  for file in "$dir"/*.html; do
    name=${file##*/}
    name=${name%.html}
    # Skips the unexpanded pattern (no match) and any non-numeric name.
    [[ "$name" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so that leading zeros are not read as octal.
    if (( 10#$name > newest )); then
      newest=$(( 10#$name ))
    fi
  done

  printf '%s\n' "$newest"
}

#######################################
# Print the value of the first "SOUP.Endless.next_url = '...';" assignment
# found in a page.  Prints nothing when the page has no such assignment.
# Arguments: $1 - path of the downloaded page
# Outputs:   the next URL (path and query string) on stdout
#######################################
next_url_of() {
  # 't found' is taken only when the substitution matched, so lines that
  # merely mention the marker are never printed verbatim; 'q' stops after
  # the first hit so the result is always a single line.
  sed -E -n \
    -e 's#^.*SOUP\.Endless\.next_url = '\''([^'\'']+)'\'';.*$#\1#' \
    -e 't found' \
    -e 'b' \
    -e ':found' \
    -e 'p' \
    -e 'q' \
    -- "$1"
}

#######################################
# Entry point.
# Arguments: $1 - base domain of the Soup
#            $2 - output directory (optional, defaults to '.')
# Returns:   exits non-zero on usage errors, failed downloads or
#            unparsable next-page URLs
#######################################
main() {
  if (( $# < 1 || $# > 2 )); then
    usage
  fi

  local -r base="http://$1"
  local -r outdir="${2:-.}"
  local -r pages_dir="${outdir}/pages"
  local -r url_re='^/since/([0-9]+)\?'
  local url='/since/0?mode=own'
  local since limit page previous_url

  mkdir -p -- "$pages_dir"
  trap cleanup EXIT

  # Don't download pages older than the newest "since" we already have
  limit="$(newest_since "$pages_dir")"

  while [[ -n "$url" ]]; do
    if [[ ! "$url" =~ $url_re ]]; then
      printf 'fetch-pages: unexpected page URL: %s\n' "$url" >&2
      exit 1
    fi
    since="${BASH_REMATCH[1]}"

    # '/since/0' is the start page and is always fetched again.
    if (( 10#$since <= limit && 10#$since != 0 )); then
      break
    fi

    echo "Fetching ${base}${url}..." >&2
    page="${pages_dir}/${since}.html"
    # Download next to the final name and rename only on success, so an
    # interrupted transfer never leaves a truncated <since>.html behind.
    partial_file="${page}.part"
    curl -f -L -o "$partial_file" "${base}${url}"
    mv -f -- "$partial_file" "$page"
    partial_file=''

    previous_url="$url"
    url="$(next_url_of "$page")"
    if [[ "$url" == "$previous_url" ]]; then
      # A page naming itself as its successor would loop forever.
      printf 'fetch-pages: page %s links to itself, giving up\n' "$url" >&2
      exit 1
    fi

    # Be nice, don't overload the servers!
    sleep 1
  done
}

main "$@"