#!/bin/bash

set -e
# With nullglob, an empty pages/ directory makes the download loop below a
# no-op instead of iterating once over the unexpanded glob pattern.
shopt -s nullglob

if [ $# -gt 1 ]; then
	cat >&2 <<'ENDUSAGE'
Usage: fetch-enclosures [ OUTDIR ]

OUTDIR defaults to the current directory. A directory called 'enclosures' will be
created inside the output directory; the output of fetch-pages is expected in
the 'pages' directory inside OUTDIR.
ENDUSAGE
	exit 1
fi


OUTDIR="$1"
[ "$OUTDIR" ] || OUTDIR=.

mkdir -p "${OUTDIR}/enclosures"


# Cuts off resize suffixes like _800
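# Illustrative transformation (hypothetical URL, following the asset pattern
# noted below):
#   .../asset/12345/6789_abcd_800.jpeg -> .../asset/12345/6789_abcd.jpeg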
unresize () {
	sed -r 's#/([^_/]+)_([^_/]+)_[^_/]+\.#/\1_\2.#'
}

# http://asset-#.soupcdn.com/asset/XXXXX/YYYY_ZZZZ.EXT will be saved as XXXXX_YYYY_ZZZZ.EXT
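# e.g. filename 'http://asset-a.soupcdn.com/asset/12345/6789_abcd.jpeg'
# prints '12345_6789_abcd.jpeg' (hypothetical URL).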
filename () {
	echo "$1" | sed -r 's#^.*/([^/]+)/([^/]+)$#\1_\2#'
}

# Using grep for this is not nice, but the Soup HTML is too broken for xsltproc...
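# Both extractors assume the <img>/<video> tag appears on the container div's
# line or the line directly after it (hence 'grep -A 1'), roughly like this
# hypothetical snippet:
#   <div class="imagecontainer" ...>
#   <img src="http://asset-a.soupcdn.com/asset/12345/6789_abcd_800.jpeg" ...>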
extract-images () {
	grep -A 1 '<div class="imagecontainer"' | sed -n -r '/<img / {s#^.*src="([^"]+)".*$#\1#;p}'
}
extract-videos () {
	# sed's 'p' prints every <video> line even when the substitution fails,
	# so the trailing grep drops sources not hosted on a Soup asset domain.
	grep -A 1 '<div class="embed"' | sed -n -r '/<video / {s#^.*src="(https?://(asset-.\.soupcdn\.com|asset\.soup\.io)/[^"]+)".*$#\1#;p}' | grep -E '^https?://(asset-.\.soupcdn\.com|asset\.soup\.io)/'
}


# Extract enclosure URLs from every fetched page and download each one,
# skipping files that are already present.
for page in "${OUTDIR}"/pages/*.html; do
	(
		cat "$page" | extract-images
		cat "$page" | extract-videos
	) | unresize | while read url; do
		file="${OUTDIR}/enclosures/$(filename "$url")"
		if [ -s "$file" ]; then
			echo "${file} found, skipping." >&2
			continue
		fi

		echo "Downloading ${url} to ${file}..." >&2
		curl -f -L -o "$file" "${url}" || true
	done
done