#!/bin/bash
set -e
shopt -s nullglob
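# With nullglob, the pages/*.html glob below expands to nothing (instead of
# the literal pattern) when no pages have been fetched yet.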
if [ $# -gt 1 ]; then
    cat >&2 <<'ENDUSAGE'
Usage: fetch-enclosures [ OUTDIR ]
OUTDIR defaults to the current directory. A directory called 'enclosures' will
be created inside the output directory; the output of fetch-pages is expected
in the 'pages' directory inside OUTDIR.
ENDUSAGE
    exit 1
fi
OUTDIR="$1"
[ "$OUTDIR" ] || OUTDIR=.
mkdir -p "${OUTDIR}/enclosures"
# Cuts off resize suffixes like _800
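# e.g. http://asset-a.soupcdn.com/asset/1234/5678_abcd_800.jpeg
#   -> http://asset-a.soupcdn.com/asset/1234/5678_abcd.jpeg   (illustrative URL)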
unresize () {
    sed -r 's#/([^_/]+)_([^_/]+)_[^_/]+\.#/\1_\2.#'
}
# http://asset-#.soupcdn.com/asset/XXXXX/YYYY_ZZZZ.EXT will be saved as XXXXX_YYYY_ZZZZ.EXT
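# Joining the last two path components with '_' keeps filenames from
# colliding when different assets share a basename.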
filename () {
echo "$1" | sed -r 's#^.*/([^/]+)/([^/]+)$#\1_\2#'
}
# Using grep for this is not nice, but the Soup HTML is too broken for xsltproc...
extract-images () {
grep -A 1 '<div class="imagecontainer"' | sed -n -r '/<img / {s#^.*src="([^"]+)".*$#\1#;p}'
}
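# Video sources are whitelisted to Soup's own asset hosts; the trailing grep
# also drops any line where the sed substitution failed to match.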
extract-videos () {
grep -A 1 '<div class="embed"' | sed -n -r '/<video / {s#^.*src="(https?://(asset-.\.soupcdn\.com|asset\.soup\.io)/[^"]+)".*$#\1#;p}' | grep -E '^https?://(asset-.\.soupcdn\.com|asset\.soup\.io)/'
}
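# Download every enclosure referenced by the fetched pages. Non-empty files
# are treated as already downloaded, so the script can safely be re-run to
# resume an interrupted or partially failed pass.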
for page in "${OUTDIR}"/pages/*.html; do
    (
        extract-images < "$page"
        extract-videos < "$page"
    ) | unresize | while read -r url; do
        file="${OUTDIR}/enclosures/$(filename "$url")"
        if [ -s "$file" ]; then
            echo "${file} found, skipping." >&2
            continue
        fi
        echo "Downloading ${url} to ${file}..." >&2
        # '|| true' keeps set -e from aborting the whole run when a single
        # download fails; -f makes curl fail on HTTP errors instead of
        # saving an error page, so the failed file is retried next run.
        curl -f -L -o "$file" "${url}" || true
    done
done