Python 应用X: 工具集合

我相信人生是值得活的,尽管人在一生中必须遭受痛苦,卑劣,残酷,不幸和死亡的折磨,我依然深信如此.但我认为人生不一定要有意义,只是对一些人而言,他们可以使人生有意义. ---J 赫胥黎

简介

记录自己做的一些稀奇古怪的程序.

工具一: 查找单词解释

原理: http://dict.cn/ 提供了查词的引擎, 输入单词即可以得到解释. 所以这个程序只是做一些体力活. 代码如下:

(有朋友反映复制之后出错. 若如此, 请移步到这里下载这个脚本: http://www.uudisc.com/user/diegoyun/file/4131948 )

#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''

DictFinder.py

@version: v1.0, updated at 2011-04-28
@author: yunshichen@gmail.com
@copyright: GPL

Description:

这个程序用于辅助英语学习. 学到一些新单词新句子之后, 我喜欢打印出来读/背.
目前实现的功能:

1>到 http://dict.cn 查找单词解释
2>将结果存到html文本.

运行例子: 见 test_it 方法.
'''

from xml.dom import minidom

import urllib
import string
import os
import sys

class DictCN:
    '''
    Look up English words through the dict.cn web service, with a disk cache.

    Cache layout under ``cache_dir``:
      * ``cache.txt``  - index, a comma separated list of ``word:filename`` pairs
      * ``xml_cache/`` - one ``<word>.xml`` file per word holding the raw answer

    Results are collected in two dictionaries keyed by the queried word:
      * ``correct_word_map`` - word -> DictcnWord
      * ``wrong_word_map``   - word -> parsed XML document of an answer that
        contains a ``<sugg>`` element
    '''

    DICT_CN_URL = 'http://dict.cn/ws.php?q=${word}'

    def __init__(self, cache_dir):
        '''
        @param cache_dir: directory holding the cache; created when missing.
        '''
        self.correct_word_map = {}
        self.wrong_word_map = {}

        self.__init_cache(cache_dir)

    def __init_cache(self, cache_dir):
        '''
        Create the cache directories/index when missing and load the index
        into ``self.cache_map`` (word -> xml file name).
        '''
        cache_file = cache_dir + "/cache.txt"
        xml_cache_dir = cache_dir + "/xml_cache"

        self.cache_file = cache_file
        self.cache_xml_dir = xml_cache_dir
        self.cache_dir = cache_dir
        self.cache_map = {}

        # Always make sure the xml directory exists, even when the index file
        # is already there. makedirs() creates cache_dir as well.
        if not os.path.isdir(xml_cache_dir):
            os.makedirs(xml_cache_dir)

        if not os.path.exists(cache_file):
            # First run: start with an empty index.
            open(cache_file, "w").close()
            return

        with open(cache_file, "r") as index:
            for line in index:
                # A cache line looks like: word1:filename1,word2:filename2,...
                for pair in line.split(","):
                    pair = pair.strip()
                    if len(pair) < 1:
                        continue
                    fields = pair.split(":")
                    if len(fields) < 2:
                        # Malformed entry (no "word:file" shape): ignore it.
                        continue
                    self.cache_map[fields[0]] = fields[1]

    def __do_query_for_word(self, en_word):
        '''
        Return the parsed XML answer (minidom document) for one word.

        The answer is read from the cache when the word is indexed and its
        xml file still exists; otherwise it is downloaded and cached.
        '''
        en_word = en_word.replace("\n", "").strip()
        xml_cache_path = self.cache_xml_dir + "/" + en_word + ".xml"

        if en_word in self.cache_map and os.path.isfile(xml_cache_path):
            with open(xml_cache_path, "rb") as cached:
                data = cached.read()

            print("--> Found " + en_word + " at cache files")
        else:
            data = self.__fetch_from_website(en_word)
            self.update_cache(en_word, data, xml_cache_path)

            print("--> Found " + en_word + " from website and update to cache.")

        return minidom.parseString(data)

    def __fetch_from_website(self, en_word):
        '''
        Download the answer for ``en_word`` and return it as UTF-8 bytes
        without the XML declaration.
        '''
        # Imported here so the module-level "import urllib" keeps working on
        # Python 2 while Python 3 gets the relocated functions.
        try:
            from urllib.request import urlopen
            from urllib.parse import quote
        except ImportError:
            from urllib import urlopen, quote

        if not isinstance(en_word, bytes):
            en_word = en_word.encode("utf8")

        # Quote the word so that phrases with spaces form a valid URL.
        aurl = string.Template(self.DICT_CN_URL).substitute({'word': quote(en_word)})

        response = urlopen(aurl)
        try:
            raw = response.read()
        finally:
            response.close()

        # The content is re-encoded from GBK to UTF-8, so the declaration
        # that still announces GBK must be removed before minidom parses it.
        data = raw.decode('gbk').encode('utf8')
        return data.replace(b'<?xml version="1.0" encoding="GBK" ?>', b'')

    def update_cache(self, en_word, data, xml_cache_path):
        '''
        Store ``data`` at ``xml_cache_path`` and register the word in the index.

        @param en_word: the word being cached.
        @param data: XML answer, UTF-8 bytes or text.
        @param xml_cache_path: full path of the xml file to write.
        '''
        if not isinstance(data, bytes):
            data = data.encode("utf8")

        with open(xml_cache_path, "wb") as xml_file:
            xml_file.write(data + b"\n")

        fname = os.path.split(xml_cache_path)[1]
        with open(self.cache_file, "a") as index:
            index.write(en_word + ":" + fname.strip() + ",")

        self.cache_map[en_word] = fname

    def do_query(self, word_strings):
        '''
        Query a comma separated list of words, for example: girl,name,national

        Words are stripped and empty items are skipped.
        '''
        for ww in word_strings.split(","):
            ww = ww.strip()
            if ww == "":
                continue
            self.handle_result(ww, None)

    def handle_result(self, ww, sen):
        '''
        Query ``ww`` and file the answer into correct_word_map or
        wrong_word_map.

        @param sen: optional extra example sentence attached to the word.
        '''
        result = self.__do_query_for_word(ww)
        if self.is_word_corrent(result):
            self.correct_word_map[ww] = DictcnWord(ww, result, sen)
        else:
            self.wrong_word_map[ww] = result

    def do_query_with_sentence(self, word_strings):
        '''
        Query words that may carry an example sentence.

        Items are separated by "|", the optional sentence follows the first
        ":" of an item, for example:
            profound: a profound book | girl: a girl is over there | name
        '''
        for pair in word_strings.split("|"):
            if pair.strip() == "":
                continue

            # Split on the first colon only, so a sentence may contain ":".
            fields = pair.split(":", 1)
            ww = fields[0].strip()

            sen = None
            if len(fields) == 2:
                # An empty sentence is treated like a missing one.
                sen = fields[1].strip() or None

            self.handle_result(ww, sen)

    def do_query_from_text_file(self, fpath):
        '''
        Query every word listed in a UTF-8 note file.

        Blank lines and lines starting with "##" or "==" are ignored. Lines
        following a line that starts with "--new_word" are passed to
        do_query_with_sentence. Exits the program when the file is missing.
        '''
        if not os.path.exists(fpath):
            print("-->file: " + fpath + " does not exist.")
            sys.exit()

        pick_up_new_word = False
        new_word_list = []
        with open(fpath, "rb") as notes:
            for raw_line in notes:
                line = raw_line.decode("utf8").strip()
                if line == "" or line.startswith("##") or line.startswith("=="):
                    continue

                # TODO: Just support new_word now.
                if line.startswith("--new_word"):
                    pick_up_new_word = True
                    continue

                if pick_up_new_word:
                    new_word_list.append(line)

        for line in new_word_list:
            self.do_query_with_sentence(line)

    def is_word_corrent(self, result):
        '''
        Return True when the parsed answer holds no <sugg> element.
        '''
        return len(result.getElementsByTagName("sugg")) == 0


class DictcnWord():
    '''
    One word of a dict.cn query result, extracted from the parsed XML answer.

    Attributes set by convert_from:
      word       - the queried word
      audio      - text of <audio>
      pron       - text of <pron>
      cn_explain - text of <def>
      sentences  - list of {"en": ..., "cn": ...} dictionaries, one per <sent>
      xml_string - the XML document serialised again with toxml()

    A sample of dictcn query result:

    <dict>
    <key>national</key>
    <lang>ec</lang>
    <audio>http://mp3.dict.cn/mp3.php?q=0FBxW</audio>
    <pron>'næʃənəl</pron>
    <def>adj.民族的, 国家的, 国立的, 全国性的
    n.国民
    （复）nationals: 全国性比赛</def>
    <sent><orig>I got my hearing aid on the National Health (Service).</orig><trans>我的助听器是国民保健署资助的。</trans></sent>
    <sent><orig>A lot has been done in the recovery of national economy in the past few years.</orig><trans>在过去的几年中我们为了国民经济的恢复做了大量的工作。</trans></sent>
    <sent><orig>The gross national product had increased 5 percent last year.</orig><trans>去年的国民生产总值提高了百分之五。</trans></sent>
    </dict>

    '''

    def __init__(self, word, xmldoc, sen=None):
        '''
        @param word: the queried word.
        @param xmldoc: parsed XML answer (minidom document).
        @param sen: optional extra English sentence to attach to the word.
        '''
        self.convert_from(word, xmldoc, sen)

    def convert_from(self, word, xmldoc, add_sen=None):
        '''
        Fill the attributes of this object from the parsed XML answer.

        When ``add_sen`` is given it is appended to ``sentences`` as an
        English sentence with an empty translation.
        '''
        # Must be set first: get_text_from_unique_element reports self.word.
        self.word = word

        self.audio = self.get_text_from_unique_element(xmldoc, "audio")
        self.pron = self.get_text_from_unique_element(xmldoc, "pron")
        self.cn_explain = self.get_text_from_unique_element(xmldoc, "def")

        self.sentences = [
            {
                "en": self.get_text_from_unique_element(sent, "orig"),
                "cn": self.get_text_from_unique_element(sent, "trans"),
            }
            for sent in xmldoc.getElementsByTagName("sent")
        ]
        if add_sen is not None:
            self.sentences.append({"en": add_sen, "cn": ""})

        self.xml_string = xmldoc.toxml()

    def get_text_from_unique_element(self, xmldoc, tagName):
        '''
        Return the text of the first ``tagName`` element below ``xmldoc``.

        Prints a notice and returns "" when there is no such element.
        '''
        found = xmldoc.getElementsByTagName(tagName)
        if not found:
            print("--> Tag " + tagName + " in " + self.word + " is null")
            return ""
        return self.__get_text(found[0].childNodes)

    def __get_text(self, nodelist):
        '''
        Concatenate the data of the text nodes found directly in ``nodelist``.
        '''
        return ''.join(
            node.data for node in nodelist if node.nodeType == node.TEXT_NODE
        )

class SimpleWordTemplate():
    '''
    Render queried words as a simple HTML page and write it to a file.

    The word objects handed to this class must provide the attributes
    ``word``, ``pron``, ``cn_explain`` and ``sentences`` (a list of
    dictionaries with the keys "en" and "cn").

    NOTE: values are inserted into the HTML verbatim, nothing is escaped.
    '''

    HTML_BEGIN = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>'
    HTML_END = "</body></html>"

    SIMPLE_EN_TEXT = '--------- ${word} [ ${pron} ]\n <ul>${sentence_str}</ul>'

    SIMPLE_CN_TEXT = '------- ${cn_expl}\n <ul>${sentence_str}</ul>'

    SIMPLE_ALL_TEXT = '--------- ${word} [ ${pron} ]\n ${cn_expl}\n <ul>${sentence_str}</ul>'

    # TODO: will move to config file
    DEFAULT_EN_PATH = "d:/download/my_word_en.html"
    DEFAULT_PATH = "d:/download/my_word.html"

    def _sentence_items(self, dict_word, lang):
        '''
        Return the "en" or "cn" sentences of a word as a run of <li> items.
        '''
        return "".join("<li>" + ss[lang] + "</li>" for ss in dict_word.sentences)

    def decorate_en_text_from(self, dict_word):
        '''
        Return the English view of a word: word, pronunciation and the
        English sentences.
        '''
        return string.Template(SimpleWordTemplate.SIMPLE_EN_TEXT).substitute({
            'word': dict_word.word,
            "pron": dict_word.pron,
            "sentence_str": self._sentence_items(dict_word, "en"),
        })

    def decorate_cn_text_from(self, dict_word):
        '''
        Return the Chinese view of a word: explanation and the translated
        sentences.
        '''
        return string.Template(SimpleWordTemplate.SIMPLE_CN_TEXT).substitute({
            'cn_expl': dict_word.cn_explain,
            "sentence_str": self._sentence_items(dict_word, "cn"),
        })

    def decorate_all_text_from(self, dict_word):
        '''
        Return the combined view of a word: word, pronunciation, Chinese
        explanation and the English sentences.
        '''
        return string.Template(SimpleWordTemplate.SIMPLE_ALL_TEXT).substitute({
            'word': dict_word.word,
            'cn_expl': dict_word.cn_explain,
            "pron": dict_word.pron,
            "sentence_str": self._sentence_items(dict_word, "en"),
        })

    def print_en_version(self, word_map, bak_path=None):
        '''
        Write the English view of every word in ``word_map`` to an HTML file.

        @param word_map: dictionary word -> word object.
        @param bak_path: target file; DEFAULT_EN_PATH when omitted.
        '''
        body = "".join(self.decorate_en_text_from(word_map[key]) for key in word_map)
        en_text = SimpleWordTemplate.HTML_BEGIN + body + SimpleWordTemplate.HTML_END

        if bak_path is None:
            bak_path = SimpleWordTemplate.DEFAULT_EN_PATH
        self.__print_to_html__(en_text, bak_path)

    def print_both_en_cn(self, word_map, bak_path=None):
        '''
        Write the combined view of every word in ``word_map`` to an HTML file.

        @param word_map: dictionary word -> word object.
        @param bak_path: target file; DEFAULT_EN_PATH when omitted.
        '''
        body = "".join(self.decorate_all_text_from(word_map[key]) for key in word_map)
        en_text = SimpleWordTemplate.HTML_BEGIN + body + SimpleWordTemplate.HTML_END

        if bak_path is None:
            bak_path = SimpleWordTemplate.DEFAULT_EN_PATH
        self.__print_to_html__(en_text, bak_path)

    def __print_to_html__(self, text, bak_path=None):
        '''
        Write ``text`` to ``bak_path`` encoded as UTF-8.

        @param bak_path: target file; DEFAULT_PATH when omitted.
        '''
        if bak_path is None:
            bak_path = SimpleWordTemplate.DEFAULT_PATH

        # Written in binary mode so the bytes on disk always match the
        # charset announced in HTML_BEGIN.
        if not isinstance(text, bytes):
            text = text.encode("utf-8")
        with open(bak_path, "wb") as f:
            f.write(text)

        print("\n--> Create file at: " + bak_path)

def test_it(cache_dir="/media/dev/open_source/word_cache",
            bak_path="/media/install/download/en_word.html"):
    '''
    Demo: query two words given inline and write the result page.

    @param cache_dir: directory handed to DictCN as its cache.
    @param bak_path: HTML file that receives the result.
    '''
    words_query = "\n profound: a profound book | magnitude: magnitude university\n\n "

    obj = DictCN(cache_dir)
    obj.do_query_with_sentence(words_query)

    template = SimpleWordTemplate()
    template.print_both_en_cn(obj.correct_word_map, bak_path)

def test_from_file(cache_dir="/media/dev/open_source/word_cache",
                   bak_path="/media/install/download/en_word.html",
                   new_word_text="/media/install/download/new_word.txt"):
    '''
    Demo: write a sample note file, query the words it lists and write the
    result page.

    @param cache_dir: directory handed to DictCN as its cache.
    @param bak_path: HTML file that receives the result.
    @param new_word_text: path of the sample note file (overwritten).
    '''
    scontent = """

        ################################################################
        ## 这个程序用于辅助英语学习. 学到一些新单词新句子之后, 我喜欢打印出来读/背.
        ## 略读型文章的笔记:
        ## new_word: 格式为单词: 句子 | 单词: 句子 | 单词: 句子 | ... 这些单词会用程序自动从网站找到解释.
        ################################################################

        ==seven habits of highly effective people
        --new_word:

        profound : We have transitioned from the Industrial Age into the Information Worker Age - with all of its profound consequences. |
        magnitude : These challenges are not only of a new order of magnitude, they are altogether different in kind.  |
        rumbling : These sweeping changes in society and rumbling shifts in the digitized global marketplace give rise to a very important question |
        digitize : These sweeping changes in society and rumbling shifts in the digitized global marketplace give rise to a very important question |
    """
    # A bare file name has no parent directory to create.
    parent = os.path.dirname(new_word_text)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)

    # Written as UTF-8 bytes so the result does not depend on the locale.
    if not isinstance(scontent, bytes):
        scontent = scontent.encode("utf-8")
    with open(new_word_text, "wb") as foo:
        foo.write(scontent)

    obj = DictCN(cache_dir)
    obj.do_query_from_text_file(new_word_text)

    template = SimpleWordTemplate()
    template.print_both_en_cn(obj.correct_word_map, bak_path)


if __name__ == "__main__":
    import time

    print("===========  Finding explanation for English new words: ")
    # time.time() gives wall-clock time, which is what "elapsed" means for
    # this network-bound run.
    begin = time.time()
    test_from_file()
    print("==== Completed. Elapsed time: " + str(time.time() - begin))

posted on 2011-04-28 02:08 Chenyunshi 阅读(1744) 评论(5) 编辑收藏引用所属分类: Python2.5/2.6

# re: Python 应用X: 工具集合 2011-06-24 13:09 zeroten

我复制之后出错，能不能提供一个py文件，或者加一个复制到剪切板功能。还有，这是python2.x的吧？  回复  更多评论

# re: Python 应用X: 工具集合 2011-06-26 14:05 Chenyunshi

@zeroten

1>嘿, 真的很感谢你的评论, 帮我发现了一个问题. 程序在ubuntu下写, 没在win下测试. 现问题已改正.

2>我把程序放到了uudisc上, 请在此下载: http://www.uudisc.com/user/diegoyun/file/4131948

3>是python2.x. django好像目前只支持2.x.  回复  更多评论

刷新评论列表

只有注册用户登录后才能发表评论。

# re: Python 应用X: 工具集合 2011-06-24 13:09 zeroten

# re: Python 应用X: 工具集合 2011-06-26 14:05 Chenyunshi

yunshichen

Python 应用X: 工具集合

简介

工具一: 查找单词解释

评论

导航

统计

常用链接

留言簿(7)

随笔分类

随笔档案

文章分类

相册

搜索

最新评论

阅读排行榜

评论排行榜