#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Client for the Dejizo online dictionaries.

The available dictionaries are English-Japanese, Japanese-English and
Wikipedia. They are based on the Web Service provided by EAST; the original
and more detailed information is available at
    http://dejizo.jp/dev/index.html
This script supports only the REST version of the service, which is
documented at
    http://dejizo.jp/dev/rest.html
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Sample code
>>> import dejizo
>>> items = dejizo.get_items('EJdict', 'dict')
>>> items
[{'expr': u'dict.', 'id': u'011348'},
 {'expr': u'dicta', 'id': u'011349'},
 {'expr': u'Dictaphone', 'id': u'011350'},
 {'expr': u'dictate', 'id': u'011351'},
 {'expr': u'dictation', 'id': u'011352'},
 {'expr': u'dictator', 'id': u'011353'},
 {'expr': u'dictatorial', 'id': u'011354'},
 {'expr': u'dictatorship', 'id': u'011355'},
 {'expr': u'diction', 'id': u'011356'},
 {'expr': u'dictionary', 'id': u'011357'}]
>>> detail = dejizo.get_detail('EJdict', '011348')
>>> detail
{'body': u'dictation\\tdictator\\tdictionary', 'head': u'dict.'}
'''

import contextlib
import re
import unicodedata
import urllib
import xml.dom.minidom

# The two helpers below live in different places on Python 2 and Python 3;
# resolve them once so the rest of the module is interpreter-agnostic.
try:
    from urllib.parse import urlencode as _urlencode
    from urllib.request import urlopen as _urlopen
except ImportError:
    from urllib import urlencode as _urlencode
    from urllib import urlopen as _urlopen

ROOT = 'http://public.dejizo.jp/NetDicV09.asmx'

# Two or more consecutive whitespace characters (a single one is left alone,
# so the single tabs separating entries in a body survive normalization).
_WHITESPACE_RUN = re.compile(r'\s\s+')


def _dom(url):
    '''
    Download the XML document at ``url`` and parse it into a minidom DOM.
    '''
    # closing() guarantees the connection is released even if read() fails.
    with contextlib.closing(_urlopen(url)) as response:
        payload = response.read()
    return xml.dom.minidom.parseString(payload)


def _text(node):
    '''
    Return the concatenated text of all text nodes below ``node``,
    in document order.
    '''
    parts = []
    for child in node.childNodes:
        if child.nodeType == child.TEXT_NODE:
            parts.append(child.data)
        elif child.hasChildNodes():
            parts.append(_text(child))
    return u''.join(parts)


def _norm(us):
    '''
    Normalize a Unicode string: apply NFKC, strip surrounding whitespace and
    collapse every run of two or more whitespace characters to one space.
    '''
    us = unicodedata.normalize('NFKC', us).strip()
    return _WHITESPACE_RUN.sub(u' ', us)


def get_items(dic, word, scope = 'HEADWORD', match = 'STARTWITH', size = 10):
    '''
    Search a dictionary and return the list of matching words.

    The result is a list of maps whose keys are "id" and "expr".

    Parameters:
    - dic: which dictionary to search: "EJdict", "EdictJE" or "wpedia"
    - word: the word to look up, as a Unicode string (a byte string is
            sent as is and is expected to be UTF-8 encoded already)
    - scope: target scope to search: "HEADWORD" or "ANYWHERE"
    - match: how to match: "STARTWITH", "ENDWITH", "EXACT" or "CONTAIN"
    - size: how many results to request; the minimum and maximum accepted
            by the service are not known

    The parameters are NOT validated by this function, so be careful...
    See also "http://dejizo.jp/dev/rest.html" for details.
    '''
    # Encode text only: calling .encode() on an already-encoded byte string
    # triggers an implicit ASCII decode on Python 2 and fails for non-ASCII.
    if isinstance(word, bytes):
        encoded_word = word
    else:
        encoded_word = word.encode('utf-8')
    m = {'Dic': dic, 'Word': encoded_word, 'Scope': scope,
         'Match': match, 'Merge': 'AND', 'Prof': 'XHTML',
         'PageSize': size, 'PageIndex': 0}
    url = '%s/SearchDicItemLite?%s' % (ROOT, _urlencode(m))
    doc = _dom(url)
    ids = doc.getElementsByTagName('ItemID')
    spans = doc.getElementsByTagName('span')
    # ItemID and span elements are paired up by their order in the document.
    return [{'id': _text(item_id), 'expr': _text(span)}
            for item_id, span in zip(ids, spans)]


def get_detail(dic, id):
    '''
    Fetch the detail of a single word.

    You can and must set only "dic" and "id".
    - dic: which dictionary to query: "EJdict", "EdictJE" or "wpedia"
    - id: ID of the word, as returned by "get_items()"

    Returns a map with the normalized "head" and "body" texts.
    Raises IndexError when the response has no Head or Body element.
    '''
    # NOTE: the parameter name "id" shadows the builtin but is part of the
    # public signature, so it is kept for keyword-argument compatibility.
    m = {'Dic': dic, 'Item': id, 'Loc': '', 'Prof': 'XHTML'}
    url = '%s/GetDicItemLite?%s' % (ROOT, _urlencode(m))
    doc = _dom(url)
    head = _norm(_text(doc.getElementsByTagName('Head')[0]))
    body = _norm(_text(doc.getElementsByTagName('Body')[0]))
    return {'head': head, 'body': body}


def _format_items(items):
    '''
    Print the items as ' "expr":index,' entries, used only in command line
    mode. A new output line is started whenever appending the next entry
    would make the current line longer than 60 characters.
    '''
    lines = [u'']
    for idx, item in enumerate(items):
        entry = u' "%s":%d,' % (item['expr'], idx)
        if len(lines[-1] + entry) > 60:
            lines.append(entry)
        else:
            lines[-1] += entry
    print(u'\n'.join(lines))


def _main():
    '''
    Command line mode.

    With "dict word" the matching words are listed together with their
    index; with "dict word index" the detail of the chosen word is shown.
    '''
    import sys
    available_dicts = {'ej': 'EJdict', 'je': 'EdictJE', 'wp': 'wpedia'}
    if len(sys.argv) < 3 or sys.argv[1] not in available_dicts:
        sys.exit(' Usage: %s ej|je|wp word' % sys.argv[0])
    dic = available_dicts[sys.argv[1]]
    word = sys.argv[2]
    if isinstance(word, bytes):
        # Python 2 hands over raw bytes; Python 3 has already decoded them.
        word = word.decode('utf-8', 'ignore')
    items = get_items(dic, word)
    if not items:
        sys.exit('Woops! You got no result!')
    if len(sys.argv) < 4:
        _format_items(items)
        return
    try:
        chosen = items[int(sys.argv[3])]
    except (ValueError, IndexError):
        # Report a bad index argument instead of dumping a traceback.
        sys.exit('Invalid index "%s": choose a number from 0 to %d'
                 % (sys.argv[3], len(items) - 1))
    item = get_detail(dic, chosen['id'])
    print(u'%s :' % item['head'])
    print(item['body'].replace(u'\t', u'\n'))


if __name__ == '__main__':
    _main()