Diffstat (limited to 'fetch-enclosures')
-rwxr-xr-x  fetch-enclosures  57
1 file changed, 57 insertions, 0 deletions
diff --git a/fetch-enclosures b/fetch-enclosures
new file mode 100755
index 0000000..42d5084
--- /dev/null
+++ b/fetch-enclosures
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+set -e
+shopt -s nullglob
+
+if [ $# -gt 1 ]; then
+    cat >&2 <<'ENDUSAGE'
+Usage: fetch-enclosures [ OUTDIR ]
+
+OUTDIR defaults to the current directory. A directory called 'enclosures' will be
+created inside the output directory; the output of fetch-pages is expected in
+the 'pages' directory inside OUTDIR.
+ENDUSAGE
+    exit 1
+fi
+
+
+OUTDIR="$1"
+[ "$OUTDIR" ] || OUTDIR=.
+
+mkdir -p "${OUTDIR}/enclosures"
+
+
+# Cuts off resize suffixes like _800
+unresize () {
+    sed -r 's#/([^_/]+)_([^_/]+)_[^_/]+\.#/\1_\2.#'
+}
+
+# http://asset-#.soupcdn.com/asset/XXXXX/YYYY_ZZZZ.EXT will be saved as XXXXX_YYYY_ZZZZ.EXT
+filename () {
+    echo "$1" | sed -r 's#^.*/([^/]+)/([^/]+)$#\1_\2#'
+}
+
+# Using grep for this is not nice, but the Soup HTML is too broken for xsltproc...
+extract-images () {
+    grep -A 1 '<div class="imagecontainer"' | sed -n -r '/<img / {s#^.*src="([^"]+)".*$#\1#;p}'
+}
+extract-videos () {
+    grep -A 1 '<div class="embed"' | sed -n -r '/<video / {s#^.*src="(http://asset-.\.soupcdn\.com/[^"]+)".*$#\1#;p}' | grep '^http://asset-.\.soupcdn\.com/'
+}
+
+
+for page in "${OUTDIR}"/pages/*.html; do
+    (
+        extract-images < "$page"
+        extract-videos < "$page"
+    ) | unresize | while IFS= read -r url; do
+        file="${OUTDIR}/enclosures/$(filename "$url")"
+        if [ -s "$file" ]; then
+            echo "${file} found, skipping." >&2
+            continue
+        fi
+
+        echo "Downloading ${url} to ${file}..." >&2
+        curl -f -L -o "$file" "${url}" || true
+    done
+done
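
Worked example: the two sed-based URL helpers applied by hand, to show the rewriting
the script relies on. The asset URL below is hypothetical; only the sed expressions
come from the script above.

    $ url='http://asset-3.soupcdn.com/asset/12345/6789_abcdef_800.jpeg'
    $ echo "$url" | sed -r 's#/([^_/]+)_([^_/]+)_[^_/]+\.#/\1_\2.#'
    http://asset-3.soupcdn.com/asset/12345/6789_abcdef.jpeg
    $ echo 'http://asset-3.soupcdn.com/asset/12345/6789_abcdef.jpeg' | sed -r 's#^.*/([^/]+)/([^/]+)$#\1_\2#'
    12345_6789_abcdef.jpeg

The first command is the unresize filter (drops the trailing _800-style suffix), the
second is the filename helper (joins the last two path components with an underscore).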
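
The HTML scraping can be exercised the same way. The snippet below is a made-up
approximation of a Soup post's imagecontainer markup, not real Soup output; it only
exists to run the grep/sed pair from extract-images against something concrete.

    $ cat sample.html
    <div class="imagecontainer">
      <img src="http://asset-2.soupcdn.com/asset/12345/6789_abcdef_800.jpeg" alt=""/>
    </div>
    $ grep -A 1 '<div class="imagecontainer"' sample.html | sed -n -r '/<img / {s#^.*src="([^"]+)".*$#\1#;p}'
    http://asset-2.soupcdn.com/asset/12345/6789_abcdef_800.jpeg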
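
Putting it together, a run might look like this (directory and file names are invented;
soup-backup/pages/ is assumed to have been filled by fetch-pages beforehand, and curl's
own progress output is omitted):

    $ ./fetch-enclosures soup-backup
    Downloading http://asset-3.soupcdn.com/asset/12345/6789_abcdef.jpeg to soup-backup/enclosures/12345_6789_abcdef.jpeg...
    $ ./fetch-enclosures soup-backup
    soup-backup/enclosures/12345_6789_abcdef.jpeg found, skipping.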