From 765ed07354c655210ee25586988bad98353ebeef Mon Sep 17 00:00:00 2001
From: Matthias Schiffer
Date: Sun, 8 Jan 2017 21:28:18 +0100
Subject: Initial commit

---
 .gitignore       |  2 ++
 README.md        | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fetch-enclosures | 57 ++++++++++++++++++++++++++++++++++++++++++++++
 fetch-pages      | 43 +++++++++++++++++++++++++++++++++++
 4 files changed, 171 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100755 fetch-enclosures
 create mode 100755 fetch-pages

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..fc3911e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/pages
+/enclosures
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..98e3ac1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,69 @@
+# Soup.io backup scripts
+
+## Usage
+
+This Soup.io backup solution consists of two scripts:
+
+## fetch-pages
+
+Crawls through the Soup pages (which consist of 20 posts each) and downloads
+them to a given output directory.
+
+Usage: fetch-pages URL [ OUTDIR ]
+
+URL is the base domain of your Soup (e.g. 'kitchen.soup.io').
+
+OUTDIR defaults to the current directory. A directory called 'pages' will be
+created inside the output directory.
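+
+For example, to back up the Soup at 'kitchen.soup.io' into a (hypothetical)
+~/soup-backup directory:
+
+    ./fetch-pages kitchen.soup.io ~/soup-backup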
+
+
+## fetch-enclosures
+
+Tries to download all enclosed images and videos of the previously downloaded
+pages.
+
+Usage: fetch-enclosures [ OUTDIR ]
+
+OUTDIR defaults to the current directory. A directory called 'enclosures' will
+be created inside the output directory; the output of fetch-pages is expected
+in the 'pages' directory inside OUTDIR.
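+
+For example, to fetch the enclosures of the pages downloaded into the
+(hypothetical) ~/soup-backup directory from above:
+
+    ./fetch-enclosures ~/soup-backup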
+
+
+## Bugs and missing features
+
+* A failed page download will interrupt fetch-pages. fetch-pages can't resume
+  the backup at the point it failed; either the base URL or LIMIT needs to be
+  adjusted in the script, or previously downloaded pages need to be removed so
+  the LIMIT calculation will allow downloading the missing pages.
+* fetch-enclosures could be adjusted to try multiple asset servers on failures.
+  Just re-running fetch-enclosures will work in case of transient failures, as
+  the script will only attempt to retrieve missing files.
+* Adding a script to extract the HTML code of individual posts from the pages
+  might be interesting; it would allow mirroring Soups that aren't primarily
+  made up of images and videos to other blog systems.
+
+
+## LICENSE
+
+Copyright (c) 2017, Matthias Schiffer
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  1. Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright notice,
+     this list of conditions and the following disclaimer in the documentation
+     and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/fetch-enclosures b/fetch-enclosures
new file mode 100755
index 0000000..42d5084
--- /dev/null
+++ b/fetch-enclosures
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+set -e
+shopt -s nullglob
+
+if [ $# -gt 1 ]; then
+	cat >&2 <<'ENDUSAGE'
+Usage: fetch-enclosures [ OUTDIR ]
+
+OUTDIR defaults to the current directory. A directory called 'enclosures' will
+be created inside the output directory; the output of fetch-pages is expected
+in the 'pages' directory inside OUTDIR.
+ENDUSAGE
+	exit 1
+fi
+
+
+OUTDIR="$1"
+[ "$OUTDIR" ] || OUTDIR=.
+
+mkdir -p "${OUTDIR}/enclosures"
+
+
+# Cuts off resize suffixes like _800
+unresize () {
+	sed -r 's#/([^_/]+)_([^_/]+)_[^_/]+\.#/\1_\2.#'
+}
+
+# http://asset-#.soupcdn.com/asset/XXXXX/YYYY_ZZZZ.EXT will be saved as XXXXX_YYYY_ZZZZ.EXT
+filename () {
+	echo "$1" | sed -r 's#^.*/([^/]+)/([^/]+)$#\1_\2#'
+}
+
+# Using grep for this is not nice, but the Soup HTML is too broken for xsltproc...
+# NOTE: the exact markup patterns matched below are assumptions; adjust them
+# if they don't match the HTML of your downloaded pages.
+extract-images () {
+	grep -A 1 '<div class="imagecontainer">' "$1" | sed -r -n 's#^.*<img [^>]*src="([^"]+)".*$#\1#p' | unresize
+}
+
+extract-videos () {
+	sed -r -n 's#^.*<video [^>]*src="([^"]+)".*$#\1#p' "$1"
+}
+
+
+for page in "${OUTDIR}/pages"/*.html; do
+	for url in $(extract-images "$page") $(extract-videos "$page"); do
+		file="${OUTDIR}/enclosures/$(filename "$url")"
+		if [ -e "$file" ]; then
+			echo "${file} already exists, skipping..." >&2
+			continue
+		fi
+
+		echo "Downloading ${url} to ${file}..." >&2
+		curl -f -L -o "$file" "${url}" || true
+	done
+done
diff --git a/fetch-pages b/fetch-pages
new file mode 100755
index 0000000..be2209a
--- /dev/null
+++ b/fetch-pages
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+set -e
+
+
+if [ $# -lt 1 -o $# -gt 2 ]; then
+	cat >&2 <<'ENDUSAGE'
+Usage: fetch-pages URL [ OUTDIR ]
+
+URL is the base domain of your Soup (e.g. 'kitchen.soup.io').
+
+OUTDIR defaults to the current directory. A directory called 'pages' will be
+created inside the output directory.
+ENDUSAGE
+	exit 1
+fi
+
+
+BASE="http://$1"
+
+OUTDIR="$2"
+[ "$OUTDIR" ] || OUTDIR=.
+
+mkdir -p "${OUTDIR}/pages"
+
+url='/since/0?mode=own'
+
+
+# Don't download pages older than the newest "since" we already have
+LIMIT="$( ( (cd "${OUTDIR}/pages" && ls -1 -U) 2>/dev/null | sed 's/\.html$//'; echo 0) | sort -r -n | head -1 )"
+
+
+while [ "$url" ]; do
+	since="$(echo "$url" | sed -r 's#^/since/([^?]+)\?.*$#\1#')"
+	if [ "$since" -le "$LIMIT" -a "$since" -ne 0 ]; then break; fi
+
+	echo "Fetching ${BASE}${url}..." >&2
+	curl -f -L -o "${OUTDIR}/pages/${since}.html" "${BASE}${url}"
+	url="$(sed -r -n '/SOUP\.Endless\.next_url/ {s#^.*SOUP\.Endless\.next_url = '\''([^'\'']+)'\'';.*$#\1#;p}' "${OUTDIR}/pages/${since}.html")"
+
+	# Be nice, don't overload the servers!
+	sleep 1
+done
--
cgit v1.2.3