From 765ed07354c655210ee25586988bad98353ebeef Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <mschiffer@universe-factory.net>
Date: Sun, 8 Jan 2017 21:28:18 +0100
Subject: Initial commit

---
 fetch-enclosures | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100755 fetch-enclosures

(limited to 'fetch-enclosures')

diff --git a/fetch-enclosures b/fetch-enclosures
new file mode 100755
index 0000000..42d5084
--- /dev/null
+++ b/fetch-enclosures
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Downloads the image/video enclosures referenced by the pages stored under OUTDIR/pages.
+set -e
+shopt -s nullglob
+
+if (( $# > 1 )); then
+	cat <<'ENDUSAGE' >&2
+Usage: fetch-enclosures [ OUTDIR ]
+
+OUTDIR defaults the current directory. A directory called 'enclosures' will be
+created inside the output directory; the output of fetch-pages is expected in
+the 'pages' directory inside OUTDIR.
+ENDUSAGE
+	exit 1
+fi
+
+# Output root: the single optional argument; unset or empty means the current directory.
+OUTDIR="${1:-.}"
+
+# Downloaded files are stored here; -p makes the call harmless if it already exists.
+mkdir -p "${OUTDIR}/enclosures"
+
+
+# Filter: rewrites the first /A_B_C. on each stdin line to /A_B. (drops resize suffixes like _800)
+unresize() {
+	sed -r -e 's#/([^_/]+)_([^_/]+)_[^_/]+\.#/\1_\2.#'
+}
+
+# Prints the local name for URL $1: http://asset-#.soupcdn.com/asset/XXXXX/YYYY_ZZZZ.EXT -> XXXXX_YYYY_ZZZZ.EXT
+filename () {
+	printf '%s\n' "$1" | sed -r 's#^.*/([^/]+)/([^/]+)$#\1_\2#'	# printf, not echo: echo would eat a $1 like -n
+}
+
+# Crude grep/sed scraping (the Soup HTML is too broken for xsltproc): emits <img> src values found on an imagecontainer div line or the line after it
+extract-images() {
+	grep -A1 -e '<div class="imagecontainer"' | sed -rn -e '/<img / {s#^.*src="([^"]+)".*$#\1#;p}'
+}
+extract-videos() {	# emits <video> src URLs on asset-?.soupcdn.com found on an embed div line or the line after it
+	grep -A1 -e '<div class="embed"' | sed -rn -e '/<video / {s#^.*src="(http://asset-.\.soupcdn\.com/[^"]+)".*$#\1#;p}' | grep -e '^http://asset-.\.soupcdn\.com/'
+}
+
+
+for page in "${OUTDIR}"/pages/*.html; do
+	# Gather this page's image and video URLs, strip resize suffixes, then fetch each (read verbatim: IFS=/-r).
+	( extract-images <"$page"; extract-videos <"$page" ) |
+	unresize | while IFS= read -r url; do
+		file="${OUTDIR}/enclosures/$(filename "$url")"
+		if [ -s "$file" ]; then
+			echo "${file} found, skipping." >&2
+			continue
+		fi
+		echo "Downloading ${url} to ${file}..." >&2
+		# Fetch to a .part name and rename on success, so an aborted transfer never leaves a non-empty
+		# $file that the -s check would skip forever. Best effort: any failure just cleans up (rm -f is 0).
+		curl -f -L -o "${file}.part" "$url" && mv -f -- "${file}.part" "$file" || rm -f -- "${file}.part"
+	done
+done
-- 
cgit v1.2.3