# $NetBSD: nanpa.sed,v 1.3 2023/01/28 13:12:16 jmcneill Exp $
#
# Parse HTML tables output by
# http://docs.nanpa.com/cgi-bin/npa_reports/nanpa
# Specifically, for each html table row (TR),
# print the <TD> elements separated by colons.
#
# This could break on HTML comments.
#
:top
# Strip ^Ms.  A carriage return can only sit at the end of the line just
# read, i.e. at the end of the pattern space.  A POSIX character class is
# used because a "\r" escape is not understood by every sed; note that
# this also drops any other single trailing control character (e.g. a tab).
s/[[:cntrl:]]$//
# Join all lines with unterminated HTML tags (a "<" with no ">" after it).
# The newline is kept here: it sits inside the tag and goes away with it.
/<[^>]*$/{
	N
	b top
}
# Replace all </TR> with EOL tag ("$" marks the end of a table row)
s;</[Tt][Rr]>;$;g
# Join lines with only <TR>.
/<[Tt][Rr][^>]*>$/{
	N
	s/\n//g
	b top
}
# Also, join all lines that have a tag whose name begins with T or R
# (<TR>, <TD>, ...) and no EOL marker after it, i.e. the row is still open.
/<[TtRr][^>]*>[^$]*$/{
	N
	s/\n//g
	b top
}
# Remove EOL markers
s/\$$//
# Remove lines that do not contain a <TR>
/<[Tt][Rr][^>]*>/!d
# Replace all <TD> with colon, swallowing surrounding blanks
s/[[:blank:]]*<[Tt][Dd][^>]*> */:/g
# Strip all HTML tags
s/<[^>]*>//g
# Handle HTML characters
s/&nbsp;/ /g
# Compress spaces/tabs
s/[[:blank:]][[:blank:]]*/ /g
# Strip leading colons (only the first colon, produced by the first <TD>)
s/://
# Strip leading/trailing whitespace
s/^ *//
s/ $//
# Strip HTML comments
s/^--.*$//