#!/bin/sh
# urlkoding – Lag sed-skript for urlkoding av teikna i fullformsordlista
#
# Copyright © 2012, 2016, 2018, 2020 Karl Ove Hufthammer <karl@huftis.org>.
#
#     This file is part of Ordbanken.
#
#     Ordbanken is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Build a helper file, urlkoding.dat, that lets «sed» URL-encode text strings.
# It is to be used for making look-up URLs for Bokmålsordboka and
# Nynorskordboka on the web.
#
# Encoding rules are added only for characters that occur in the third
# tab-separated column of the full-form word lists (fullformsliste_*.txt),
# plus the typographic apostrophe «’».
#
# The generated sed script acts on lines containing a ###…### marker pair:
# the text between the markers is percent-encoded and put back in place of
# the whole marked section (markers included).
#
# Requires gawk (empty field separator), hexdump and GNU sed (-i, «\n» in
# the replacement text).

# Scratch files: the raw characters, and their percent-encoded counterparts,
# one per line and in the same order.
teikn=$(mktemp) || exit 1
urlteikn=$(mktemp) || { rm -f -- "$teikn"; exit 1; }

cleanup() {
  rm -f -- "$teikn" "$urlteikn"
}
trap cleanup EXIT
# Some shells (e.g. dash) do not run the EXIT trap when killed by a signal,
# so turn the usual signals into a regular exit.
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# Collect every distinct character of column 3 (tab is cut's default
# delimiter).  ASCII letters, digits, «%» and «.» get no rule; «%» and the
# hex digits make up the replacement text of the rules, which are applied one
# after another, so they have to stay untouched.
# The apostrophe is added before de-duplication so it never yields two rules.
# sort runs in the C locale: with locale collation, distinct characters can
# compare as equal, and «sort -u» would then silently drop all but one.
{
  cut -f3 fullformsliste_*.txt |
    gawk -F '' '
      { for (i = 1; i <= NF; i++) if ($i !~ /[a-zA-Z0-9%.]/) teikn[$i]++ }
      END { for (x in teikn) print x }
    '
  printf '%s\n' '’'
} | LC_ALL=C sort -u > "$teikn"

# Percent-encode each line byte by byte: hexdump prints every byte as pXX,
# the newline byte (p0a) is turned back into a line break, and the remaining
# «p» markers become «%».  This has to read the raw characters, i.e. it must
# run before the escaping step below.
hexdump -v -e '1/1 "p%02x"' < "$teikn" |
  sed -e 's/p0a/\n/g' -e 's/p/%/g' > "$urlteikn"

# Backslash-escape characters that must not appear bare in the pattern part
# of the generated s/…/…/ commands.
# NOTE(review): only «$», «/» and «+» are handled; other characters that are
# special in a sed pattern (such as «[», «\» or «^») would presumably need the
# same treatment if they ever turn up in the word lists.
sed -i -e 's/\([$/+]\)/\\\1/g' "$teikn"

# Write the sed script.  The fixed parts are passed as printf *arguments*, so
# their backslashes are never interpreted as printf escapes.
{
  # For a line with a marker pair: save the line in the hold space and keep
  # only the text between the markers in the pattern space …
  printf '%s\n' '/###.*###/{h;s/.*###\(.*\)###.*/\1/;'
  # … apply one «s/<character>/<percent-encoding>/g;» rule per character …
  paste -d/ "$teikn" "$urlteikn" | sed 's/.*/s\/&\/g;/'
  # … then append the saved line and replace its marked section with the
  # encoded text.
  printf '%s\n' 'G;s/\(.*\)\n\(.*\)###.*###\(.*\)/\2\1\3/}'
} > urlkoding.dat

# Finished: remove the scratch files now and drop the handlers again.
cleanup
trap - EXIT HUP INT TERM
