#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re, urllib, os.path, cPickle
import feedparser
################################################################################
#
# configure HISTORYPATH, HISTORYMAX and DSTXMLPATH as you like
#
# Pickle file that keeps the history list between runs (presumably the path
# handed to load_history/dump_history -- confirm at the call site).
HISTORYPATH = '/path/to/history.p'
# Maximum number of entries dump_history leaves in the history list.
HISTORYMAX = 50
# Destination of the generated feed XML (presumably the path handed to
# save_xml -- confirm at the call site).
DSTXMLPATH = '/path/to/voaspecialenglish.xml'
#
################################################################################
# URL of the upstream "Top Stories" RSS feed (presumably the feed this script
# parses -- confirm at the call site).  The two adjacent literals inside the
# parentheses are joined by implicit string concatenation.
ORGXMLPATH = ('http://www.voanews.com/specialenglish/customCF/'
'RecentStoriesRSS.cfm?keyword=TopStories')
# Feed-level template, filled by %-formatting with a mapping that supplies the
# keys 'pubdate' and 'items'.
# NOTE(review): no element markup is visible in this literal -- confirm that
# the XML tags were not lost before relying on the generated output.
XML = '''
VOA News: Top Stories
http://voaspecialenglish.com/
%(pubdate)sen-usVOICE OF AMERICAVOA News: Top StoriesVOA News: Top StoriesUp to the minute news from Voice of AmericaUp to the minute news from Voice of AmericaVOA Podcasts - Voice Of America - English - Special Englishvoanews@voanews.com
%(items)s
'''
# Per-item template, filled by %-formatting with a mapping that supplies the
# keys 'title', 'link', 'pubdate' and 'summary'.  The backslash right after
# the opening quotes keeps the string from starting with a newline.
# NOTE(review): as with XML above, no element markup is visible here -- confirm.
ITEM = '''\
%(title)s
%(link)s
%(pubdate)s%(link)sVOICE OF AMERICA%(title)s%(summary)sno
'''
##
def load_history(p):
    '''
    Load history list. Each element is map.

    p -- path of the pickle file holding the history.

    Returns the unpickled object, or a new empty list when ``p`` does not
    exist.  Errors raised while opening or unpickling an existing file are
    propagated to the caller.
    '''
    if not os.path.exists(p):
        return []
    # NOTE(security): unpickling can execute arbitrary code; ``p`` must only
    # ever point at a file this script wrote itself.
    # The default (text) mode is kept on purpose: the history is written in
    # text mode with pickle's default protocol, and both sides must agree.
    # 'with' closes the handle even when unpickling raises.
    with open(p) as f:
        return cPickle.load(f)
##
def dump_history(h, p):
    '''
    Dump history list. Adjust the length up to HISTORYMAX.

    h -- history list; every element must provide a 'pubdate_tuple' item.
    p -- path of the pickle file to (over)write.

    ``h`` is modified in place: it is sorted by descending 'pubdate_tuple'
    and then cut down to at most HISTORYMAX elements before being pickled.
    '''
    # key=/reverse=True is the stable equivalent of the former
    # cmp(y, x) comparison: elements with equal keys keep their order.
    h.sort(key=lambda entry: entry['pubdate_tuple'], reverse=True)
    # Drop everything past the limit in one step; max() keeps a negative
    # limit from being interpreted as an index counted from the end.
    del h[max(HISTORYMAX, 0):]
    # Text mode with pickle's default protocol, as before.  'with' closes
    # the handle even when pickling raises.
    with open(p, 'w') as f:
        cPickle.dump(h, f)
##
def save_xml(xml, p):
    '''
    If there are any changes in XML, save it.

    xml -- complete text to store.
    p   -- path of the destination file.

    The file is written when it does not exist yet or when its current
    content differs from ``xml``; otherwise it is left untouched.
    '''
    if os.path.exists(p):
        # 'with' closes the handle even when reading raises.
        with open(p) as f:
            if f.read() == xml:
                # Identical content: skip the write so the file is not
                # rewritten needlessly.
                return
    with open(p, 'w') as f:
        f.write(xml)
##
# Compiled once at import time instead of on every call.
_TAG_RE = re.compile(r'<[a-z/].*?>', re.DOTALL | re.IGNORECASE)
# An entity reference is '&', a run without whitespace, '&' or ';', then ';'.
# Restricting the run keeps a bare '&' in ordinary text ("R&D is fun; ...")
# from being treated as the start of an entity that ends at a distant ';'.
_ENTITY_RE = re.compile(r'&[^\s&;]*;')
_SPACES_RE = re.compile(r'\s\s+')
# The five predefined XML entity references; only these are kept.
_KEPT_ENTITIES = frozenset(('&amp;', '&lt;', '&gt;', '&apos;', '&quot;'))


def semiplain(s):
    '''
    Remove HTML tags and entity reference which are not allowed in
    Podcast feed.

    s -- text that may contain markup.

    Returns the text with tags removed, every entity reference other than
    &amp; &lt; &gt; &apos; &quot; removed, runs of two or more whitespace
    characters collapsed to one space, and leading/trailing whitespace
    stripped.  A result longer than 4000 characters is cut to 3996
    characters followed by ' ...'.
    '''
    s = _TAG_RE.sub('', s)
    s = _ENTITY_RE.sub(
        lambda m: m.group(0) if m.group(0) in _KEPT_ENTITIES else '', s)
    s = _SPACES_RE.sub(' ', s)
    s = s.strip()
    # '<=' so that a text of exactly 4000 characters is returned unchanged.
    if len(s) <= 4000:
        return s
    return s[:3996] + ' ...'
##
def get_from_article(link):
'''
Extract body of article, URL of MP3 and file size of MP3
'''
bodypat = re.compile('(.*?)\s', re.DOTALL)
mp3pat = re.compile('