#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import urllib
import os.path
import cPickle

import feedparser

################################################################################
#
# Configure HISTORYPATH, HISTORYMAX and DSTXMLPATH as you like.
#
HISTORYPATH = '/path/to/history.p'
HISTORYMAX = 50
DSTXMLPATH = '/path/to/voaspecialenglish.xml'
#
################################################################################

ORGXMLPATH = ('http://www.voanews.com/specialenglish/customCF/'
              'RecentStoriesRSS.cfm?keyword=TopStories')

# NOTE(review): the two templates below contain no XML element tags in the
# text under review -- presumably the markup was lost before the file reached
# its current form.  The wording and the %(...)s placeholders are unchanged;
# restore the tags from the original feed template before relying on the
# generated output.
XML = '''
VOA News: Top Stories
http://voaspecialenglish.com/
%(pubdate)s
en-us
VOICE OF AMERICA
VOA News: Top Stories
VOA News: Top Stories
Up to the minute news from Voice of America
Up to the minute news from Voice of America
VOA Podcasts - Voice Of America - English - Special English
voanews@voanews.com
%(items)s
'''

ITEM = '''\
%(title)s
%(link)s
%(pubdate)s
%(link)s
VOICE OF AMERICA
%(title)s
%(summary)s
no
'''

# Patterns and limits used by semiplain(); compiled once at import time.
_TAG_RE = re.compile(r'<[a-z/].*?>', re.DOTALL | re.IGNORECASE)
# Matches only real entity syntax (named, decimal or hexadecimal), so plain
# text such as "A & B; C" is left alone.
_ENTITY_RE = re.compile(
    r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')
_SPACES_RE = re.compile(r'\s\s+')
# The five entity references predefined by XML; everything else is removed.
_ALLOWED_ENTITIES = ('&amp;', '&lt;', '&gt;', '&apos;', '&quot;')
_SUMMARY_MAX = 4000
_ELLIPSIS = ' ...'


##
def load_history(p):
    '''
    Load the history list from the pickle file at path `p`.

    Each element is a map.  Returns an empty list when `p` does not exist.
    '''
    if not os.path.exists(p):
        return []
    # NOTE: unpickling can execute arbitrary code; `p` must only ever point
    # at a file written by dump_history().
    with open(p) as f:
        return cPickle.load(f)


##
def dump_history(h, p):
    '''
    Dump the history list `h` to the pickle file at path `p`.

    `h` is modified in place: it is sorted newest first by each element's
    'pubdate_tuple' value and then cut down to at most HISTORYMAX elements.
    '''
    # Stable descending sort; same ordering as the former cmp-based sort.
    h.sort(key=lambda entry: entry['pubdate_tuple'], reverse=True)
    del h[HISTORYMAX:]
    with open(p, 'w') as f:
        cPickle.dump(h, f)


##
def save_xml(xml, p):
    '''
    Write `xml` to path `p` unless the file already holds exactly that text.
    '''
    if os.path.exists(p):
        with open(p) as f:
            if f.read() == xml:
                # Nothing changed; leave the existing file untouched.
                return
    with open(p, 'w') as f:
        f.write(xml)


##
def semiplain(s):
    '''
    Remove HTML tags and entity references which are not allowed in a
    Podcast feed.

    Tags are dropped, every entity reference other than the five predefined
    XML ones is dropped, runs of two or more whitespace characters collapse
    to one space and the result is stripped.  The returned string is at most
    4000 characters long; longer text is cut and ends with ' ...'.
    '''
    def keep_allowed(match):
        ent = match.group(0)
        if ent in _ALLOWED_ENTITIES:
            return ent
        return ''

    s = _TAG_RE.sub('', s)
    s = _ENTITY_RE.sub(keep_allowed, s)
    s = _SPACES_RE.sub(' ', s).strip()
    if len(s) <= _SUMMARY_MAX:
        return s
    # Reserve room for the ellipsis so the total stays within the limit.
    return s[:_SUMMARY_MAX - len(_ELLIPSIS)] + _ELLIPSIS


# NOTE(review): get_from_article is cut off inside a string literal at this
# point; the fragment below is kept verbatim and still needs to be restored.
## def get_from_article(link): ''' Extract body of article, URL of MP3 and file size of MP3 ''' bodypat = re.compile('(.*?)\s', re.DOTALL) mp3pat = re.compile('