summary | refs | log | tree | commit | diff | stats
path: root/fetch-pages
diff options
context:
space:
mode:
Diffstat (limited to 'fetch-pages')
-rwxr-xr-x fetch-pages 43
1 files changed, 43 insertions, 0 deletions
diff --git a/fetch-pages b/fetch-pages
new file mode 100755
index 0000000..be2209a
--- /dev/null
+++ b/fetch-pages
@@ -0,0 +1,43 @@
+#!/bin/bash
+# fetch-pages: download the pages of a Soup into OUTDIR/pages.
+set -e
+
+# Exactly one or two arguments are accepted; otherwise print usage and fail.
+if [ $# -lt 1 ] || [ $# -gt 2 ]; then
+ cat >&2 <<'ENDUSAGE'
+Usage: fetch-pages URL [ OUTDIR ]
+
+URL is the base domain of your Soup (e.g. 'kitchen.soup.io').
+
+OUTDIR defaults to the current directory. A directory called 'pages' will be
+created inside the output directory.
+ENDUSAGE
+ exit 1
+fi
+
+
+# Site root; $1 is the bare domain, so the scheme is prepended here.
+BASE="http://$1"
+# Output directory: second argument, or the current directory when omitted/empty.
+OUTDIR="${2:-.}"
+mkdir -p "${OUTDIR}/pages"
+
+# First page to request; later pages come from each page's next_url.
+url='/since/0?mode=own'
+
+# Don't download pages older than the newest "since" we already have.
+# Only all-digit basenames count, so stray files cannot make LIMIT non-numeric.
+LIMIT="$( ( (cd "${OUTDIR}/pages" && ls -1 -U) 2>/dev/null | sed -n 's/^\([0-9][0-9]*\)\.html$/\1/p'; echo 0) | sort -r -n | head -1 )"
+
+
+# Walk the chain of pages: each saved page names its successor in next_url.
+while [ "$url" ]; do
+ since="$(printf '%s\n' "$url" | sed -r 's#^/since/([^?]+)\?.*$#\1#')"
+ # next_url comes from the server and becomes a file name: accept digits only.
+ case "$since" in ''|*[!0-9]*) echo "Unexpected next URL: ${url}" >&2; exit 1;; esac
+ if [ "$since" -le "$LIMIT" ] && [ "$since" -ne 0 ]; then break; fi
+ echo "Fetching ${BASE}${url}..." >&2
+ curl -f -L -o "${OUTDIR}/pages/${since}.html" "${BASE}${url}"
+ url="$(sed -r -n '/SOUP\.Endless\.next_url/ {s#^.*SOUP\.Endless\.next_url = '\''([^'\'']+)'\'';.*$#\1#;p}' "${OUTDIR}/pages/${since}.html")"
+ sleep 1 # Be nice, don't overload the servers!
+done