diff options
author | Matthias Schiffer <mschiffer@universe-factory.net> | 2017-01-08 21:28:18 +0100 |
---|---|---|
committer | Matthias Schiffer <mschiffer@universe-factory.net> | 2017-01-08 21:28:18 +0100 |
commit | 765ed07354c655210ee25586988bad98353ebeef (patch) | |
tree | d6122b1aa20e085aa83c291e85bf863b8c2802a1 /fetch-pages | |
download | soup-backup-765ed07354c655210ee25586988bad98353ebeef.tar soup-backup-765ed07354c655210ee25586988bad98353ebeef.zip |
Initial commit
Diffstat (limited to 'fetch-pages')
-rwxr-xr-x | fetch-pages | 43 |
1 files changed, 43 insertions, 0 deletions
#!/bin/bash
#
# fetch-pages: save the HTML pages of a Soup below OUTDIR/pages.
#
# Usage: fetch-pages URL [ OUTDIR ]
#
# Fetching starts at '/since/0?mode=own'. Every page is stored as
# OUTDIR/pages/<since>.html, and the URL of the following page is taken from
# the "SOUP.Endless.next_url = '...';" assignment inside the page just saved.
# The loop stops when no next URL is found, or when the next "since" value is
# not greater than the largest one already stored (page 0 is always fetched).
#
# Exit status: 1 on wrong usage or an unusable next URL; otherwise the status
# of the first failing command (set -e), 0 on success.

set -e


# Print an error message to stderr and abort.
die() {
  printf 'fetch-pages: %s\n' "$*" >&2
  exit 1
}


if [[ $# -lt 1 || $# -gt 2 ]]; then
  cat >&2 <<'ENDUSAGE'
Usage: fetch-pages URL [ OUTDIR ]

URL is the base domain of your Soup (e.g. 'kitchen.soup.io').

OUTDIR defaults the current directory. A directory called 'pages' will be
created inside the output directory.
ENDUSAGE
  exit 1
fi


BASE="http://$1"

OUTDIR="${2:-.}"
PAGES="${OUTDIR}/pages"

mkdir -p -- "$PAGES"

url='/since/0?mode=own'

# A next URL is only accepted in the form /since/<digits>?...; the digits
# become part of a file name, so nothing else may get through.
since_re='^/since/([0-9]+)\?'

# Partial downloads go to a hidden temporary file that is removed on any exit,
# so an interrupted transfer never leaves a truncated <since>.html behind.
tmp=''
cleanup() {
  [[ -z "$tmp" ]] || rm -f -- "$tmp"
}
trap cleanup EXIT


# Don't download pages older than the newest "since" we already have
LIMIT=0
for page in "$PAGES"/*.html; do
  name="${page##*/}"
  name="${name%.html}"
  # Ignore anything that is not <digits>.html (this also covers the literal
  # pattern left behind when the glob matches nothing).
  [[ "$name" =~ ^[0-9]+$ ]] || continue
  # 10# forces base 10 so names with leading zeros are not read as octal.
  if (( 10#$name > LIMIT )); then
    LIMIT=$(( 10#$name ))
  fi
done


while [[ -n "$url" ]]; do
  [[ "$url" =~ $since_re ]] || die "unexpected next URL: ${url}"
  since="${BASH_REMATCH[1]}"

  # Page 0 is always fetched; any other page stops the run once it is not
  # newer than what is already on disk.
  if (( 10#$since <= LIMIT && 10#$since != 0 )); then
    break
  fi

  dest="${PAGES}/${since}.html"
  tmp="${PAGES}/.${since}.html.part"

  echo "Fetching ${BASE}${url}..." >&2
  curl -f -L -o "$tmp" "${BASE}${url}"
  mv -f -- "$tmp" "$dest"
  tmp=''

  # Pull the next page's URL out of the page that was just stored; the result
  # is empty (ending the loop) when no line mentions SOUP.Endless.next_url.
  url="$(sed -r -n '/SOUP\.Endless\.next_url/ {s#^.*SOUP\.Endless\.next_url = '\''([^'\'']+)'\'';.*$#\1#;p}' "$dest")"

  # Be nice, don't overload the servers!
  sleep 1
done