#!/bin/bash
#
# fetch-pages: download the "endless scroll" HTML pages of a Soup blog.
#
# Usage: fetch-pages URL [ OUTDIR ]
#
# Starting at "/since/0?mode=own", each page is saved as
# OUTDIR/pages/<since>.html, and the next page to fetch is read from the
# "SOUP.Endless.next_url = '...';" assignment embedded in the page that was
# just saved. The walk stops when a page has no such assignment, or when it
# reaches a "since" value that is not newer than the newest page already on
# disk (the page for since=0 is always fetched again).
#
# Requires: curl, sed.

set -euo pipefail

# Path of the download currently in progress; removed on exit so that an
# interrupted transfer never leaves a truncated page behind.
part_file=''

# Print the usage message to stderr.
usage() {
  cat >&2 <<'ENDUSAGE'
Usage:
    fetch-pages URL [ OUTDIR ]

URL is the base domain of your Soup (e.g. 'kitchen.soup.io').
OUTDIR defaults the current directory. A directory called 'pages'
will be created inside the output directory.
ENDUSAGE
}

# Print an error message to stderr and abort with status 1.
die() {
  printf 'fetch-pages: %s\n' "$*" >&2
  exit 1
}

# EXIT trap: delete a partially downloaded file, if there is one.
cleanup() {
  if [[ -n "$part_file" ]]; then
    rm -f -- "$part_file"
  fi
}

# Print the largest numeric <since> among the "<since>.html" files in the
# directory $1, or 0 when there is none. Names that are not purely numeric
# are ignored, which also covers the unexpanded "*.html" pattern that the
# glob yields for an empty directory.
newest_saved_since() {
  local dir=$1
  local path name
  local newest=0
  for path in "$dir"/*.html; do
    name=${path##*/}
    name=${name%.html}
    [[ "$name" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so that a leading zero is not read as octal.
    if (( 10#$name > newest )); then
      newest=$(( 10#$name ))
    fi
  done
  printf '%s\n' "$newest"
}

# Print the first next_url found in the saved page $1, or an empty line
# when the page does not contain one.
extract_next_url() {
  local page=$1
  local matches
  # The "p" flag on the substitution prints a line only when the pattern
  # really matched, so unrelated lines that merely mention next_url are
  # never mistaken for a URL.
  matches=$(sed -E -n \
    "s#^.*SOUP\.Endless\.next_url = '([^']+)';.*\$#\1#p" -- "$page")
  # Keep only the first match if the page contains several.
  printf '%s\n' "${matches%%$'\n'*}"
}

main() {
  if (( $# < 1 || $# > 2 )); then
    usage
    exit 1
  fi

  local base="http://$1"
  local outdir=${2:-.}
  local pages_dir="${outdir}/pages"
  mkdir -p -- "$pages_dir"
  trap cleanup EXIT

  # Don't download pages older than the newest "since" we already have.
  local limit
  limit=$(newest_saved_since "$pages_dir")

  local -r since_re='^/since/([0-9]+)\?'
  local url='/since/0?mode=own'
  local since page

  while [[ -n "$url" ]]; do
    # The URL comes from downloaded HTML: insist on a purely numeric
    # "since" so it is safe to use in arithmetic and as a file name.
    [[ "$url" =~ $since_re ]] || die "unexpected next URL: ${url}"
    since=${BASH_REMATCH[1]}

    if (( 10#$since <= limit && 10#$since != 0 )); then
      break
    fi

    printf '%s\n' "Fetching ${base}${url}..." >&2

    # Download to a temporary name and rename only on success, so a failed
    # or interrupted transfer cannot masquerade as a complete page.
    page="${pages_dir}/${since}.html"
    part_file="${page}.part"
    curl -f -L -o "$part_file" "${base}${url}"
    mv -f -- "$part_file" "$page"
    part_file=''

    url=$(extract_next_url "$page")

    # Be nice, don't overload the servers!
    sleep 1
  done
}

main "$@"