#!/bin/sh -e
# converts HTML from a URL, file, or stdin to markdown
# uses an available program to fetch URL and tidy to normalize it first
#
# Options:
#   -e ENC, --encoding ENC, --encoding=ENC   character encoding of the input
#   -g CMD, --grabber CMD,  --grabber=CMD    program (plus options) used to
#                                            fetch a URL
#
# The helpers err, errn and pathfind and the variables THIS, NEWLINE, ARGS
# and THIS_TEMPDIR are not defined in this file.
# NOTE(review): presumably they are provided by shared code spliced in at the
# "### common.sh" / "### tempdir.sh" marker lines by a build step -- confirm.

# Keep errexit active even when the script is started as "sh script", in
# which case the option on the shebang line is not applied.
set -e

REQUIRED="tidy"
SYNOPSIS="converts HTML from a URL, file, or STDIN to markdown-formatted text."

### common.sh

# grab_url_with URL [GRABBER_COMMAND_LINE]
#
# Fetch URL and write the raw document to stdout.
#
# Arguments:
#   $1   URL to fetch (required; the shell aborts with an error if missing).
#   $2.. optional grabber command line: program name followed by extra
#        options.  It is parsed with shell quoting rules, so an option that
#        contains whitespace can be protected with quotes.
#
# When no grabber is given, the first program found among
# wget, lynx, w3m, curl, links, w3c is used.
#
# Returns 1 when no usable grabber is found; otherwise the exit status of
# the grabber itself.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"
    shift

    cmdline="$*"
    set --
    if [ -n "$cmdline" ]; then
        # NOTE(review): eval executes any shell syntax embedded in the
        # grabber string.  That string comes from the user's own command
        # line, but it must never be fed from an untrusted source.
        eval "set -- $cmdline"
    fi

    # After the eval the positional parameters hold the grabber program
    # followed by its options, one word each.  They are kept that way (not
    # re-joined and re-evaluated) so that quoting is honoured exactly once.
    prog=
    if [ $# -gt 0 ]; then
        prog=$1
        shift
    fi
    prog_opts="$*"    # flattened copy, used only for the log message below

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done

        if [ -z "$prog" ]; then
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        fi
    elif ! pathfind "$prog"; then
        err "$THIS: No such web grabber '$prog' found; aborting."
        return 1
    fi

    # Setup proper base options for known grabbers: each one needs its own
    # flag to dump the unrendered source to stdout.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        curl)  base_opts="" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."

    # base_opts is deliberately left unquoted: it is a fixed, glob-free list
    # of flags ("-n -get") that must be split into separate words.
    # shellcheck disable=SC2086
    set -- $base_opts "$@"
    "$prog" "$@" "$url"
}

# parse_arguments ARG...
#
# Parse command-line arguments into the globals encoding, grabber and
# argument.  The first non-option word becomes the input file or URL; any
# further non-option words are reported and ignored.  Exits with status 1
# when an option that needs a value is the last word.
parse_arguments () {
    while [ $# -gt 0 ]; do
        case "$1" in
            --encoding=*)
                # extract encoding from after =
                encoding="${1#*=}"
                ;;
            -e|--encoding|-encoding)
                if [ $# -lt 2 ]; then
                    err "$THIS: option '$1' requires an argument."
                    exit 1
                fi
                shift
                encoding="$1"
                ;;
            --grabber=*)
                # extract grabber command line from after =
                # Stored exactly like the "-g CMD" form so that both
                # spellings accept a program followed by options.
                grabber="${1#*=}"
                ;;
            -g|--grabber|-grabber)
                if [ $# -lt 2 ]; then
                    err "$THIS: option '$1' requires an argument."
                    exit 1
                fi
                shift
                grabber="$1"
                ;;
            *)
                if [ -z "$argument" ]; then
                    argument="$1"
                else
                    err "Warning: extra argument '$1' will be ignored."
                fi
                ;;
        esac
        shift
    done
}

argument=
encoding=
grabber=

# ARGS is split on NEWLINE only, so that individual arguments may contain
# blanks; the expansion is intentionally unquoted.
oldifs="$IFS"
IFS=$NEWLINE
# shellcheck disable=SC2086
parse_arguments $ARGS
IFS="$oldifs"

inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

### tempdir.sh

    grabber_out="$THIS_TEMPDIR/grabber.out"
    grabber_log="$THIS_TEMPDIR/grabber.log"
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi

    # From here on the downloaded copy is processed like a local file.
    argument="$grabber_out"
fi

if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.
    # NOTE(review): the sed program below was rewritten during review
    # because the original expression was not readable; it takes the
    # charset=... value of the first <meta> tag found in the leading lines
    # of the document.  Confirm it matches the intended detection rules.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -n 's/.*<meta[^>]*charset=["'\'']*\([-a-z0-9_.:]*\).*/\1/p' |
        head -n 1
    )
fi

# to_utf8 FILE...
#
# Write FILE to stdout, converted to UTF-8 with iconv when an encoding is
# known and iconv is installed; otherwise the input is copied unchanged
# (i.e. it is assumed to be UTF-8 already).
# NOTE(review): written during review because the script calls to_utf8 but
# its definition was not readable -- confirm against the intended behavior.
to_utf8 () {
    if [ -n "$encoding" ] && command -v iconv >/dev/null 2>&1; then
        iconv -f "$encoding" -t utf-8 "$@"
    else
        cat "$@"
    fi
}

# tidy's diagnostics are discarded on purpose: it reports warnings for most
# real-world HTML, and only the normalized XHTML on stdout is wanted.
if [ -z "$argument" ]; then
    # No file or URL given: convert standard input.
    tidy -asxhtml -utf8 2>/dev/null |
        pandoc --ignore-args -r html -w markdown "$@"
elif [ -f "$argument" ]; then
    to_utf8 "$argument" |
        tidy -asxhtml -utf8 2>/dev/null |
        pandoc --ignore-args -r html -w markdown "$@"
else
    err "File '$argument' not found."
    exit 1
fi