diff options
Diffstat (limited to 'fetch-enclosures')
-rwxr-xr-x | fetch-enclosures | 57 |
1 files changed, 57 insertions, 0 deletions
#!/bin/bash
#
# fetch-enclosures: download the images and videos referenced by saved pages.
#
# Usage: fetch-enclosures [ OUTDIR ]
#
# Reads every "${OUTDIR}/pages/*.html" file (the output of fetch-pages),
# extracts the asset URLs found in them and stores each asset in
# "${OUTDIR}/enclosures". OUTDIR defaults to the current directory. Assets
# whose target file already exists and is non-empty are skipped, so the script
# can simply be re-run to pick up where it left off.

# NOTE: 'pipefail' is deliberately not enabled: extract-videos ends in a grep
# that exits 1 whenever a page contains no video, which is not an error.
set -e
shopt -s nullglob

# Prints the usage text to stderr.
usage() {
  cat >&2 <<'ENDUSAGE'
Usage: fetch-enclosures [ OUTDIR ]

OUTDIR defaults the current directory. A directory called 'enclosures' will be
created inside the output directory; the output of fetch-pages is expected in
the 'pages' directory inside OUTDIR.
ENDUSAGE
}

# Filter (stdin -> stdout): cuts off resize suffixes like _800, i.e. rewrites
# ".../YYYY_ZZZZ_800.EXT" to ".../YYYY_ZZZZ.EXT". Lines that do not have a
# third "_"-separated component are passed through unchanged.
unresize() {
  sed -E 's#/([^_/]+)_([^_/]+)_[^_/]+\.#/\1_\2.#'
}

# Prints the local file name for the URL given as $1:
# http://asset-#.soupcdn.com/asset/XXXXX/YYYY_ZZZZ.EXT will be saved as
# XXXXX_YYYY_ZZZZ.EXT (the last two path components joined by "_").
filename() {
  # printf instead of echo: safe for arbitrary data.
  printf '%s\n' "$1" | sed -E 's#^.*/([^/]+)/([^/]+)$#\1_\2#'
}

# Filter (stdin: page HTML -> stdout: one URL per line). Prints the src
# attribute of each <img> found on an imagecontainer <div> line or on the line
# directly after it.
# Using grep for this is not nice, but the Soup HTML is too broken for
# xsltproc...
extract-images() {
  grep -A 1 '<div class="imagecontainer"' \
    | sed -n -E '/<img / {s#^.*src="([^"]+)".*$#\1#;p;}'
}

# Filter (stdin: page HTML -> stdout: one URL per line). Prints the src
# attribute of each <video> found on an embed <div> line or on the line
# directly after it, keeping only http://asset-?.soupcdn.com/ URLs.
# Exits non-zero when no video URL was found (status of the final grep).
extract-videos() {
  grep -A 1 '<div class="embed"' \
    | sed -n -E '/<video / {s#^.*src="(http://asset-.\.soupcdn\.com/[^"]+)".*$#\1#;p;}' \
    | grep '^http://asset-.\.soupcdn\.com/'
}

# Downloads URL ($1) to FILE ($2), best effort.
# The data is written to "FILE.part" first and only renamed to FILE once curl
# reports success. Writing directly to FILE would leave a truncated, non-empty
# file behind after an interrupted transfer, and later runs would then skip it
# as "found". A failed download is reported on stderr but does not abort the
# script.
download() {
  local url=$1 file=$2
  local partial="${file}.part"

  if curl -f -L -o "$partial" "$url"; then
    mv -f -- "$partial" "$file"
  else
    rm -f -- "$partial"
    echo "Download of ${url} failed." >&2
  fi
}

# Entry point.
# Arguments: $1 - optional output directory (default: current directory).
# Exits with status 1 after printing the usage text if more than one argument
# is given.
main() {
  if (( $# > 1 )); then
    usage
    exit 1
  fi

  local outdir=${1:-.}
  local page url file

  mkdir -p -- "${outdir}/enclosures"

  # With nullglob set, the loop body is simply not run if there are no pages.
  for page in "${outdir}"/pages/*.html; do
    {
      extract-images <"$page"
      # No video on a page is normal; do not let grep's status 1 matter.
      extract-videos <"$page" || true
    } | unresize | while read -r url; do
      # -r keeps backslashes intact; the default IFS still trims stray
      # leading/trailing whitespace from the URL.
      [[ -n "$url" ]] || continue

      file="${outdir}/enclosures/$(filename "$url")"
      if [[ -s "$file" ]]; then
        echo "${file} found, skipping." >&2
        continue
      fi

      echo "Downloading ${url} to ${file}..." >&2
      download "$url" "$file"
    done
  done
}

main "$@"