#!/bin/sh
# hent-ordbank – Hent, pakk ut og gjer ordbankfilene klare til bruk.
#
# Copyright © 2008, 2009, 2010, 2012, 2018 Karl Ove Hufthammer <karl@huftis.org>.
#
#     This file is part of Ordbanken.
#
#     Ordbanken is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Work from the parent directory (presumably the script is started from
# the "skript" subdirectory -- confirm against how it is invoked).
# Abort if the directory change fails: continuing would let unzip
# overwrite files in whatever directory we happen to be in.
cd .. || {
  printf '%s\n' "hent-ordbank: cannot change to the parent directory" >&2
  exit 1
}

# Download the files manually from http://www.edd.uio.no/prosjekt/ordbanken/data/index.html
# wget http://www.edd.uio.no/prosjekt/ordbanken/data/ordbank_bm.zip
# wget http://www.edd.uio.no/prosjekt/ordbanken/data/ordbank_nn.zip
# wget http://www.edd.uio.no/prosjekt/ordbanken/dataformat.txt

# Unpack the archives and overwrite the old files (-o). The -a -aa
# options ask unzip to extract every entry as text (see unzip(1)).
for arkiv in ordbank_bm.zip ordbank_nn.zip
do
  unzip -o -a -aa "$arkiv"
  status=$?
  # unzip reports mere warnings with exit status 1 while still completing
  # the extraction; only a higher status means the archive was not unpacked.
  if [ "$status" -gt 1 ]
  then
    printf '%s\n' "hent-ordbank: unzip failed for $arkiv (exit status $status)" >&2
    exit "$status"
  fi
done

# Re-encode the files from Latin-1 to UTF-8 and
# fix problems with line endings (CRLF -> LF).
#
# The conversion goes through a temporary file because iconv cannot
# convert a file in place. The converted text is written back with
# "cat > file" rather than "mv", so the data file keeps its own
# permissions instead of inheriting the owner-only mode of the mktemp file.
tmpfil=$(mktemp) || {
  printf '%s\n' "hent-ordbank: could not create a temporary file" >&2
  exit 1
}
for fil in fullform_bm.txt fullform_nn.txt paradigme_bm.txt paradigme_nn.txt
do
  # Only touch the data file once iconv has succeeded; otherwise a failed
  # conversion would replace it with empty or truncated output.
  iconv -f latin1 -t utf8 "$fil" > "$tmpfil" &&
    cat "$tmpfil" > "$fil" &&
    dos2unix "$fil" || {
      printf '%s\n' "hent-ordbank: could not convert $fil" >&2
      rm -f "$tmpfil"
      exit 1
    }
done
# The temporary file is no longer moved away, so remove it explicitly.
rm -f "$tmpfil"

# Sort the files.
# The sorter is run from inside the "skript" directory. A subshell is used
# so the working directory of this script is left unchanged no matter what
# happens; with a plain "cd skript; ...; cd .." a failed first cd would
# leave the script one level too high for the steps that follow.
(cd skript && ./sorter) || {
  printf '%s\n' "hent-ordbank: sorting failed" >&2
  exit 1
}

# Turn programmer's (ASCII) apostrophes into
# real (typographic) apostrophes in the full-form files.
apostrofuttrykk="s/'/’/g"
sed -i -e "$apostrofuttrykk" fullform_*.txt
