From 765ed07354c655210ee25586988bad98353ebeef Mon Sep 17 00:00:00 2001
From: Matthias Schiffer
Date: Sun, 8 Jan 2017 21:28:18 +0100
Subject: Initial commit

---
 .gitignore       |  2 ++
 README.md        | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fetch-enclosures | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 fetch-pages      | 43 +++++++++++++++++++++++++++++++++++
 4 files changed, 171 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100755 fetch-enclosures
 create mode 100755 fetch-pages

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fc3911e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/pages
+/enclosures
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..98e3ac1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,69 @@
+# Soup.io backup scripts
+
+## Usage
+
+This Soup.io backup solution consists of two scripts:
+
+## fetch-pages
+
+Crawls through the Soup pages (which consist of 20 posts each) and downloads
+them to a given output directory.
+
+Usage: fetch-pages URL [ OUTDIR ]
+
+URL is the base domain of your Soup (e.g. 'kitchen.soup.io').
+
+OUTDIR defaults to the current directory. A directory called 'pages' will be
+created inside the output directory.
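+
+For example, to back up the Soup at 'kitchen.soup.io' into a (hypothetical)
+~/soup-backup directory:
+
+    ./fetch-pages kitchen.soup.io ~/soup-backup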
+
+
+## fetch-enclosures
+
+Tries to download all enclosed images and videos of the previously downloaded
+pages.
+
+Usage: fetch-enclosures [ OUTDIR ]
+
+OUTDIR defaults to the current directory. A directory called 'enclosures' will
+be created inside the output directory; the output of fetch-pages is expected
+in the 'pages' directory inside OUTDIR.
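+
+For example, to fetch the enclosures of the pages downloaded into the
+(hypothetical) ~/soup-backup directory from above:
+
+    ./fetch-enclosures ~/soup-backup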
+
+
+## Bugs and missing features
+
+* A failed page download will interrupt fetch-pages. fetch-pages can't resume
+  the backup at the point it failed; either the base URL or LIMIT needs to be
+  adjusted in the script, or previously downloaded pages need to be removed so
+  the LIMIT calculation will allow downloading the missing pages.
+* fetch-enclosures could be adjusted to try multiple asset servers on failures.
+  Just re-running fetch-enclosures will work in case of transient failures, as
+  the script will only attempt to retrieve missing files.
+* Adding a script to extract the HTML code of individual posts from the pages
+  might be interesting; it would allow mirroring Soups that aren't primarily
+  made up of images and videos to other blog systems.
+
+
+## LICENSE
+
+Copyright (c) 2017, Matthias Schiffer
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright notice,
+     this list of conditions and the following disclaimer in the documentation
+     and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/fetch-enclosures b/fetch-enclosures
new file mode 100755
index 0000000..42d5084
--- /dev/null
+++ b/fetch-enclosures
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+set -e
+shopt -s nullglob
+
+if [ $# -gt 1 ]; then
+	cat >&2 <<'ENDUSAGE'
+Usage: fetch-enclosures [ OUTDIR ]
+
+OUTDIR defaults to the current directory. A directory called 'enclosures' will
+be created inside the output directory; the output of fetch-pages is expected
+in the 'pages' directory inside OUTDIR.
+ENDUSAGE
+	exit 1
+fi
+
+
+OUTDIR="$1"
+[ "$OUTDIR" ] || OUTDIR=.
+
+mkdir -p "${OUTDIR}/enclosures"
+
+
+# Cuts off resize suffixes like _800
+unresize () {
+	sed -r 's#/([^_/]+)_([^_/]+)_[^_/]+\.#/\1_\2.#'
+}
+
+# http://asset-#.soupcdn.com/asset/XXXXX/YYYY_ZZZZ.EXT will be saved as XXXXX_YYYY_ZZZZ.EXT
+filename () {
+	echo "$1" | sed -r 's#^.*/([^/]+)/([^/]+)$#\1_\2#'
+}
+
+# Using grep for this is not nice, but the Soup HTML is too broken for xsltproc...
+# NOTE: the exact markup patterns matched below are assumptions; adjust them
+# if they don't match the HTML of your downloaded pages.
+extract-images () {
+	grep -A 1 '<div class="imagecontainer">' "$1" | sed -r -n 's#^.*<img [^>]*src="([^"]+)".*$#\1#p' | unresize
+}
+
+extract-videos () {
+	sed -r -n 's#^.*<video [^>]*src="([^"]+)".*$#\1#p' "$1"
+}
+
+
+for page in "${OUTDIR}/pages"/*.html; do
+	for url in $(extract-images "$page") $(extract-videos "$page"); do
+		file="${OUTDIR}/enclosures/$(filename "$url")"
+		if [ -e "$file" ]; then
+			echo "${file} already exists, skipping..." >&2
+			continue
+		fi
+
+		echo "Downloading ${url} to ${file}..." >&2
+		curl -f -L -o "$file" "${url}" || true
+	done
+done
diff --git a/fetch-pages b/fetch-pages
new file mode 100755
index 0000000..be2209a
--- /dev/null
+++ b/fetch-pages
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+set -e
+
+
+if [ $# -lt 1 -o $# -gt 2 ]; then
+	cat >&2 <<'ENDUSAGE'
+Usage: fetch-pages URL [ OUTDIR ]
+
+URL is the base domain of your Soup (e.g. 'kitchen.soup.io').
+
+OUTDIR defaults to the current directory. A directory called 'pages' will be
+created inside the output directory.
+ENDUSAGE
+	exit 1
+fi
+
+
+BASE="http://$1"
+
+OUTDIR="$2"
+[ "$OUTDIR" ] || OUTDIR=.
+
+mkdir -p "${OUTDIR}/pages"
+
+url='/since/0?mode=own'
+
+
+# Don't download pages older than the newest "since" we already have
+LIMIT="$( ( (cd "${OUTDIR}/pages" && ls -1 -U) 2>/dev/null | sed 's/\.html$//'; echo 0) | sort -r -n | head -1 )"
+
+
+while [ "$url" ]; do
+	since="$(echo "$url" | sed -r 's#^/since/([^?]+)\?.*$#\1#')"
+	if [ "$since" -le "$LIMIT" -a "$since" -ne 0 ]; then break; fi
+
+	echo "Fetching ${BASE}${url}..." >&2
+	curl -f -L -o "${OUTDIR}/pages/${since}.html" "${BASE}${url}"
+	url="$(sed -r -n '/SOUP\.Endless\.next_url/ {s#^.*SOUP\.Endless\.next_url = '\''([^'\'']+)'\'';.*$#\1#;p}' "${OUTDIR}/pages/${since}.html")"
+
+	# Be nice, don't overload the servers!
+	sleep 1
+done
--
cgit v1.2.3