#!/bin/sh
# urlkoding – Lag sed-skript for urlkoding av teikna i fullformsordlista
#
# Copyright © 2012 Karl Ove Hufthammer <karl@huftis.org>.
#
#     This file is part of Ordbanken.
#
#     Ordbanken is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.


# Build "urlkoding.dat", a sed script that URL-encodes text strings.
# It is meant for building lookup URLs for the online dictionaries
# Bokmålsordboka and Nynorskordboka. Note that the percent-encoding is
# based on the Windows-1252 character encoding, which the dictionaries
# seem to handle best (for example for words containing apostrophes).
#
# Only characters that actually occur in the full-form word lists
# (fullform_*.txt in the current directory) get an encoding rule; ASCII
# letters, digits, "%" and "." are never encoded.
#
# For every input line containing "###text###", the generated script
# replaces that marker with the URL-encoded form of "text".
#
# Requires iconv, hexdump and GNU sed (for "\+" and "\t").

# All scratch files live in one private directory that is removed on exit.
tmpdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$tmpdir"' EXIT
trap 'exit 1' HUP INT TERM

teikn=$tmpdir/teikn         # characters to encode, one per line (UTF-8)
cp1252=$tmpdir/cp1252       # the same characters converted to Windows-1252
urlteikn=$tmpdir/urlteikn   # hex value of each character, one per line
escaped=$tmpdir/escaped     # the characters, escaped for use as sed patterns

# Split every input line into single characters, keep each distinct
# character once, and drop empty lines and characters needing no encoding.
awk 'BEGIN {FS=""; OFS="\n"} {$1=$1; print}' fullform_*.txt \
  | sort -u \
  | grep -v '^[a-zA-Z0-9%.]\?$' > "$teikn"

# Convert to Windows-1252. A character that cannot be represented makes
# iconv stop early, which would pair the remaining characters with the
# wrong byte values, so treat that as a hard error.
if ! iconv -f utf8 -t cp1252 < "$teikn" > "$cp1252"; then
  printf '%s\n' 'urlkoding: iconv could not convert the character list to cp1252' >&2
  exit 1
fi

# One hex byte per line; the "0a" lines are the newline separators.
hexdump -v -e '1/1 "%02x\n"' < "$cp1252" | grep -vxF 0a > "$urlteikn"

# Backslash-escape every character that is special in a sed basic regular
# expression (or is the "/" delimiter) so that each rule matches literally.
sed 's|[\\/[^$*.]|\\&|' "$teikn" > "$escaped"

# printf is used instead of echo because some sh implementations let echo
# interpret backslash sequences such as "\n" and "\1", which would corrupt
# the generated script.
{
  # Prologue: save the line in the hold space and isolate the marked text.
  printf '%s\n' '/###.*###/{h;s/.*###\(.*\)###.*/\1/;'
  # One "s/<char>/%<hex>/g;" rule per character.
  paste -d '\t' "$escaped" "$urlteikn" \
    | sed 's|\(.\+\)\t\(..\)|s/\1/%\2/g;|'
  # Epilogue: splice the encoded text back in place of the marker.
  printf '%s\n' 'G;s/\(.*\)\n\(.*\)###.*###\(.*\)/\2\1\3/}'
} > urlkoding.dat
