#!/bin/bash
# gettextdictionary.sh: print one bilingual dictionary to stdout.
# Dumps the per-letter wiki pages User:Matthias_Buchmeier/SOURCE-TARGET-<letter>
# (letters a-z plus 0) with lynx and keeps only the lines containing "::".
#
#printout usage info:
# NOTE(review): the test expression was missing from the file as found;
# requiring two arguments is assumed from the usage text below -- confirm.
if [ $# -lt 2 ]
then
  # Written to stderr because stdout is normally redirected into the
  # dictionary file (see the usage line).
  echo "usage: gettextdictionary.sh SOURCE-ISO TARGET-ISO >dictionaryfile.ding" >&2
  echo "where SOURCE-ISO and TARGET-ISO are the iso-languagecodes of the source- target-language resp., e.g \"es\" \"en\" for the Spanish-English dictionary" >&2
  exit 1
fi
iso=$1
iso2=$2
WIKIPATH="User:Matthias_Buchmeier/$iso-$iso2"
for letter in a b c d e f g h i j k l m n o p q r s t u v w x y z 0
do
  # NOTE(review): the awk regexes were damaged in the file as found ("/+/" and
  # "/^/"); collapsing runs of spaces and dropping the leading space is the
  # assumed intent -- confirm against a known-good dictionary file.
  lynx -width=1000 -nolist -underscore -dump -assume_charset=utf-8 -display_charset=utf-8 \
    "https://en.wiktionary.org/w/index.php?title=$WIKIPATH-$letter&printable=yes" \
    | awk '/::/ {gsub(/ +/, " "); sub(/^ /, ""); print;}'
done
#!/bin/bash
# Directory that receives the downloaded dictionary files.
installdir="."
#installdir=/usr/share/trans
# Abort with status 1 unless each external tool used below can be found.
for PROG in lynx awk
do
  if ! command -v "$PROG" >/dev/null 2>&1
  then
    echo >&2 "Program $PROG is required but it's not installed. Aborting."
    exit 1
  fi
done
#######################################
# Download all per-letter pages of one dictionary into a single file.
# For each suffix a-z and 0 the page "<$1>-<suffix>" is dumped with lynx;
# only lines containing "::" are kept and appended to the output file.
# Arguments:
#   $1 - wiki page title prefix (e.g. User:Matthias_Buchmeier/en-es)
#   $2 - output file; truncated before the first page is fetched
# Outputs:
#   progress message on stdout, dictionary lines in $2
#######################################
function download {
  local page_prefix=$1
  local outfile=$2
  local letter
  echo "downloading $outfile"
  : >"$outfile"
  for letter in a b c d e f g h i j k l m n o p q r s t u v w x y z 0
  do
    # NOTE(review): the awk regexes were damaged in the file as found ("/+/"
    # and "/^/"); collapsing runs of spaces and dropping the leading space is
    # the assumed intent -- confirm against a known-good dictionary file.
    lynx -width=1000 -nolist -underscore -dump -assume_charset=utf-8 -display_charset=utf-8 \
      "https://en.wiktionary.org/w/index.php?title=${page_prefix}-${letter}&printable=yes" \
      | awk '/::/ {gsub(/ +/, " "); sub(/^ /, ""); print;}' >>"$outfile"
  done
}
# First pass: the en-<lang> pages, one output file per language.
for lang in es it pt fr nl de fi no sv cs hu pl ru ja arb cmn fa hi vi el he tr ko bg ro ca sh da; do
  WIKIPATH="User:Matthias_Buchmeier/en-${lang}"
  TARGETPATH="${installdir}/en-${lang}-enwiktionary.txt"
  download $WIKIPATH $TARGETPATH
done
# Second pass: the <lang>-en pages for the languages that have them listed here.
for lang in es it fr fi pt; do
  WIKIPATH="User:Matthias_Buchmeier/${lang}-en"
  TARGETPATH="${installdir}/${lang}-en-enwiktionary.txt"
  download $WIKIPATH $TARGETPATH
done
If you want to redistribute the text dictionaries, the Creative Commons Attribution-ShareAlike 3.0 Unported License requires, among other things, attribution of the contributors.
The following code can be used to download a list of all users of en.wiktionary.
#!/bin/bash
# generates list of enwiktionary contributors, sorted by number of edits
#
# Pages through the MediaWiki "allusers" API with wget, pipes every XML reply
# through userlistfilter.awk (expected in the current directory) and collects
# the result in $TEMPFILE. The awk filter ends each reply with a line
# "NEXT<TAB>name" naming the user to continue from, or
# "NEXT<TAB>THELASTUSERLIST" when there is nothing left to fetch.
# wget diagnostics are appended to WgetErr.txt.

# exclude contributors with less edits:
EDITTHREASH=200
TEMPFILE=./users-unsorted.txt
TARGET=CREDITS
#APIFLAGS=\&redirects\&aulimit=500\&auexcludegroup=bot
# include all bots, as inactive bots will be included anyhow
APIFLAGS='&redirects&aulimit=500'
API_URL='https://en.wiktionary.org/w/api.php?action=query&list=allusers&format=xml&auprop=editcount&auwitheditsonly'

# Print the continuation value from the last line of $TEMPFILE
# (prints nothing if that line is not a NEXT marker).
next_marker() {
  tail -n 1 "$TEMPFILE" | gawk 'BEGIN {FS="\t";} /^NEXT/ {print $2;}'
}

wget --quiet "${API_URL}${APIFLAGS}" -O - 2>>WgetErr.txt \
  | gawk -f userlistfilter.awk -v THREASH="$EDITTHREASH" >"$TEMPFILE"
NEXT=$(next_marker)
echo "$NEXT"
# NOTE(review): the loop condition was missing from the file as found. It is
# reconstructed from the THELASTUSERLIST sentinel printed by userlistfilter.awk;
# the empty-string guard avoids looping forever if no marker could be read.
while [ -n "$NEXT" ] && [ "$NEXT" != "THELASTUSERLIST" ]
do
  wget --quiet "${API_URL}&aufrom=${NEXT}${APIFLAGS}" -O - 2>>WgetErr.txt \
    | gawk -f userlistfilter.awk -v THREASH="$EDITTHREASH" >>"$TEMPFILE"
  NEXT=$(next_marker)
  echo "$NEXT"
done
# User lines start with a zero-padded edit count, so a reverse sort puts the
# most active contributors first. Only those lines are kept; the NEXT marker
# lines must not end up in the credits.
sort -r "$TEMPFILE" | gawk 'BEGIN {FS="\t";} /^[0-9]/ {print $2;}' >"$TARGET"
rm "$TEMPFILE"
# userlistfilter.awk (gawk; uses gensub)
# Filters the XML reply of the MediaWiki "allusers" query.
# Input is split into one record per XML element by using "><" as the record
# separator.
# Output: one line "<8-digit zero-padded editcount>\t<name>" for every user
# whose edit count is at least THREASH (default 1), followed by one final line
# "NEXT\t<name to continue from>" or "NEXT\tTHELASTUSERLIST".
# NOTE(review): the bracket expressions of the regexes were missing from the
# file as found; they are reconstructed on the assumption that records look
# like  u userid=".." name=".." editcount=".."  and
# continue aufrom=".." continue=".."  -- confirm against a real API reply.
BEGIN {
    RS="><";
    Count_Threash=1;
    # THREASH may be supplied by the caller with -v THREASH=n
    if(THREASH!="") Count_Threash=THREASH;
}
/u userid/ {
    # text between name=" and the closing quote before editcount
    ID=gensub(/(^.*name=")(.*)(" editcount.*$)/, "\\2", "g", $0);
    # digits inside editcount=".."
    COUNT=gensub(/(^.*editcount=")([0-9]*)(").*$/, "\\2", "g", $0);
    # 1.0*COUNT forces a numeric comparison
    if(1.0*COUNT>=Count_Threash) {printf "%08d\t", COUNT; print ID;}
}
/continue aufrom/ {
    NEXT=gensub(/(^.*continue aufrom=")(.*)(" continue.*$)/, "\\2", "g", $0);
    # ampersand has to percent-encoded with %26
    # (an XML-escaped "&amp;" stands for a single ampersand as well)
    gsub(/&(amp;)?/, "%26", NEXT);
}
END {
    # no continuation element seen: tell the caller this was the last page
    if(NEXT=="") { print "NEXT\tTHELASTUSERLIST"; exit;}
    print "NEXT\t"NEXT;
}