#!/bin/sh -e
#
# # DASHT-QUERY-HTML 1            2020-05-16                            2.4.0
#
# ## NAME
#
# dasht-query-html - searches [Dash] docsets and emits HTML table rows
#
# ## SYNOPSIS
#
# `dasht-query-html` [*PATTERN*] [*DOCSET*]...
#
# ### Examples
#
# `dasht-query-html`
#   Topics (A-Z) from each installed docset.
#
# `dasht-query-html` 'c - x'
#   Search for "c - x" in all installed docsets.
#
# `dasht-query-html` 'c - x' bash
#   Search for "c - x" only in the "bash" docset.
#
# `dasht-query-html` 'c - x' bash css
#   Search for "c - x" only in the "bash" and "css" docsets.
#
# ## DESCRIPTION
#
# Searches for *PATTERN* in all installed [Dash] docsets, optionally searching
# only in those whose names match *DOCSET*s, by calling dasht-query-line(1).
# The results are then printed, one per line, to stdout as HTML table rows.
# However, if no results were found, this program exits with a nonzero status.
#
# ### Searching
#
# Whitespace characters in *PATTERN* are treated as wildcards, whereas the
# SQL LIKE wildcard characters `%` and `_` are not: they are taken literally.
#
# Before searching, *PATTERN* is surrounded by whitespace wildcards so that it
# can match anywhere: beginning, middle, or end.  As a result, if *PATTERN* is
# undefined, it becomes a whitespace wildcard and thereby matches everything.
#
# ## ENVIRONMENT
#
# `DASHT_DOCSETS_DIR`
#   Defines the filesystem location where your [Dash] docsets are installed.
#   If undefined, its value is assumed to be `$XDG_DATA_HOME/dasht/docsets/`
#   or, if `XDG_DATA_HOME` is undefined, `$HOME/.local/share/dasht/docsets/`.
#
# ## EXIT STATUS
#
# 44
#   No results were found.
#
# ## SEE ALSO
#
# dasht-query-line(1), dasht-docsets(1), dasht(1), [Dash]
#
# [Dash]: https://kapeli.com/dash
#
# ## AUTHOR
#
# Written in 2016 by Suraj N. Kurapati <https://github.com/sunaku/dasht>
# Distributed under the terms of the ISC license (refer to README file).

# Exit with status 44 when the search yields no results.  The producer side
# of the pipeline below runs in a subshell and so cannot set the exit status
# of this script directly; instead it signals the main shell ($$) with USR1
# and this trap performs the exit.
trap 'exit 44' USR1

# The search pattern is handed to awk through the environment rather than
# with `-v`, because awk expands backslash escape sequences in `-v` values
# and would thereby corrupt any pattern that contains a backslash.
{ dasht-query-line "$@" || kill -s USR1 $$ ;} |
DASHT_QUERY_HTML_PATTERN="${1-}" awk '
  # Escapes XML "predefined entities" in the given string and returns it.
  # All other bytes (including the STX, ETX and VT markers used below)
  # pass through unchanged.
  # See http://www.w3.org/TR/REC-xml/#sec-predefined-ent
  function escape(xml) {
    gsub("&",    "\\&amp;",  xml) # first, so later entities stay intact
    gsub("\"",   "\\&quot;", xml)
    gsub("\047", "\\&#39;",  xml) # https://www.w3.org/TR/xhtml1/#C_16
    gsub("<",    "\\&lt;",   xml)
    gsub(">",    "\\&gt;",   xml)
    return xml
  }

  # Inserts the given breaker at the end of capitalized words, runs of
  # lowercase text, numbers, and punctuation marks in the given string.
  # Characters in the extras string are considered to be part of words.
  function wordbreak(string, breaker, extras) {
    gsub("[" extras "[:upper:]]+[" extras "[:lower:]]*|"\
         "[" extras "[:lower:]]+|"\
         "[" extras "[:digit:]]+|"\
         "[" extras "[:punct:]]+", "&" breaker, string)
    return string
  }

  # Renders the given text as HTML: XML entities are escaped and a <wbr>
  # word-break opportunity is placed at every point chosen by wordbreak().
  # Characters in the extras string are considered to be part of words.
  function wrappable(text, extras) {
    # Mark the break points with VT bytes first: escape() leaves them alone,
    # whereas literal <wbr> tags would be escaped along with the text.
    text = escape(wordbreak(text, "\v", extras))
    gsub("\v", "<wbr>", text)
    return text
  }

  # Same as wrappable(text) but remembers each answer, because the same
  # value tends to repeat across many consecutive search results.
  function wrappable_cached(text) {
    if (!(text in wrappable_cache)) {
      wrappable_cache[text] = wrappable(text)
    }
    return wrappable_cache[text]
  }

  # Transforms alphabetical characters into bracketed regular expressions
  # that match either lowercase or uppercase versions of those characters.
  # This basically emulates the IGNORECASE feature in a POSIX environment.
  # The parameters after "regex" are local variables, not arguments.
  function ignorecase(regex,    result, pos, chr) {
    result = ""
    while ((pos = match(regex, "[[:alpha:]]")) > 0) {
      chr = substr(regex, pos, 1)
      result = result substr(regex, 1, pos - 1) \
               "[" tolower(chr) toupper(chr) "]"
      regex = substr(regex, pos + 1)
    }
    return result regex
  }

  BEGIN {
    pattern = ENVIRON["DASHT_QUERY_HTML_PATTERN"]

    # escape regex(7) syntax, including the backslash itself, so that the
    # pattern is matched literally when highlighting search results
    gsub("[\\^.[$()|*+?{\\\\]", "\\\\&", pattern)

    sub("^[[:space:]]+", "", pattern)         # strip leading whitespace
    sub("[[:space:]]+$", "", pattern)         # strip trailing whitespace
    gsub("[[:space:]]+", ".*", pattern)       # treat whitespace as wildcards
    pattern = ignorecase(pattern)             # emulate IGNORECASE=1 for POSIX
    if (pattern == "") pattern = "^."         # grouped by leading character
    FS = "\t"                                 # fields are separated by tabs
  }

  # Open the table only once there is at least one result to put in it.
  NR == 1 {
    print "<table>"
  }

  # Each input line is one search result: name, docset, type and URL.
  {
    name = $1
    from = wrappable_cached($2)
    type = escape(tolower($3))
    url  = escape($4) # also reused by END for the single-result redirect

    # mark search terms with STX and ETX bytes which are ignored by escape();
    # they count as word characters so that wordbreak() keeps them attached
    gsub(pattern, "\002&\003", name)
    name = wrappable(name, "\002\003")

    # highlight search terms in search result using the STX and ETX markers
    gsub("\002", "<b><i><u>", name)
    gsub("\003", "</u></i></b>", name)

    print \
      "<tr>"\
        "<td><a href=\"" url "\">" name "</a></td>"\
        "<td valign=\"bottom\" align=\"right\">" from "</td>"\
        "<td valign=\"bottom\">" type "</td>"\
      "</tr>"
  }

  END {
    if (NR > 0) {
      print "</table>"
      if (NR == 1) {
        # there was only one search result, so automatically visit its url
        print "<meta http-equiv=\"refresh\" content=\"0;url=" url "\">"
      }
    }
  }
'