#!/usr/bin/python
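# plucker-urls.cgi: pull URLs out of Palm "Plucker URLs" memos (unpacked from
# MemoDB.pdb with "par"), look up each page's <title>, cache everything in gdbm
# databases under ~/public_html/plucker-urls, and serve the result as an HTML
# page supporting update, display, erase-all and hide actions.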
import os
import cgi
import sys
import time
import gdbm
import string
import socket
import urllib2
import traceback
def do_ct():
    # Emit the CGI header exactly once, even though do_ct() may be called again
    # from the exception handler at the bottom of the script
    if not getattr(do_ct, 'header_sent', False):
        do_ct.header_sent = True
        print 'Content-type: text/html'
        print
        # Send stderr (e.g. tracebacks) to the browser as well
        sys.stderr = sys.stdout
try:
sys.path.insert(0,os.path.expanduser('~strombrg/lib'))
import BeautifulSoup
import cachedb
first_separator = chr(1)
second_separator = chr(2)
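    # One bookmark record: the memo file it came from, the memo's title, the URL,
    # the page's fetched <title>, a hidden flag, a category, and a free-form comment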
class url_class:
def __init__(self, url, filename, memotitle, comment):
self.url = string.strip(url)
self.urltitle = ''
self.filename = filename
self.memotitle = string.strip(memotitle)
self.hidden = 0
self.comment = comment
self.category = ''
def __cmp__(self, other):
if self.memotitle < other.memotitle:
return -1
elif self.memotitle > other.memotitle:
return 1
if self.urltitle < other.urltitle:
return -1
elif self.urltitle > other.urltitle:
return 1
if self.url < other.url:
return -1
elif self.url > other.url:
return 1
return 0
def hide(self):
self.hidden = 1
def unhide(self):
self.hidden = 0
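        # Database keys are "url SEP memotitle", where SEP is first_separator (chr(1))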
def to_database_key(self):
return '%s%s%s' % (self.url, first_separator, self.memotitle)
def __str__(self):
return "%s|%s|%s|%s" % (self.filename, self.memotitle, self.url, self.urltitle)
        def to_html(self):
            # The HTML markup (tr/td/a tags and the ?hide href) is a reconstruction -
            # the original tags were lost when this listing was rendered as a web page
            if self.hidden:
                return ''
            s = '<tr>\n'
            if self.url.find(':') == -1:
                # No scheme, so treat the memo line as a Google search term
                url = 'http://www.google.com/search?hl=en&q=%s&btnG=Google+Search' % self.url
            else:
                url = self.url
            for value in [ self.memotitle,
                           '<a href="%s">%s</a>' % (url, self.urltitle[:50]),
                           '<a href="%s">%s</a>' % (url, self.url[:30]),
                           '<a href="?hide%s%s">hide</a>' % (first_separator, self.to_database_key()),
                           'categorize',
                           'comment' ]:
                s += '\t<td>%s</td> | \n' % value
            s += '</tr>\n'
            return s
def to_database_representation(self):
s = '%s%s%s%s%s%s%s%s%d%s%s%s%s' % (self.filename, first_separator, self.memotitle, first_separator, self.url, first_separator, self.urltitle, first_separator, self.hidden, first_separator, self.category, first_separator, self.comment)
# sys.stdout.write('Converting to database representation %s\n' % s)
return s
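        # Fetch self.url and record the page's <title> as self.urltitle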
def get_title(self):
# assume that urllib2 isn't reusing preexisting sockets somehow...
# be careful to unset this timeout in all exit paths, so we don't accidentally end up with socket timeouts on sockets that shouldn't time out!
socket.setdefaulttimeout(20)
try:
url=urllib2.urlopen(self.url)
html=url.read()
url.close()
#except ValueError:
except:
socket.setdefaulttimeout(None)
self.urltitle = '(Could not open URL)'
else:
socket.setdefaulttimeout(None)
soup=BeautifulSoup.BeautifulSoup()
soup.feed(html)
                titles = soup.first('title')
                if titles and titles.contents:
                    self.urltitle = titles.contents[0]
                else:
                    self.urltitle = '(Could not obtain title)'
del soup
# print 'got urltitle', self.urltitle
def from_database_representation(s):
        #              0             1          2        3            4           5             6
        # s = filename SEP memotitle SEP url SEP urltitle SEP hidden SEP category SEP comment   (SEP == first_separator)
fields = string.splitfields(s, first_separator)
# sys.stdout.write('fields is %s\n' % str(fields))
# br()
# url , filename, memotitle, comment
url = url_class(fields[2], fields[0], fields[1], fields[6])
url.urltitle = fields[3]
url.hidden = string.atoi(fields[4])
url.category = fields[5]
# sys.stdout.write('Converted %s to %s\n' % (str(fields), str(url)))
return url
def delete_chars(s, c):
# delete all occurences of character c from string s
return string.joinfields(string.splitfields(s, c), '')
    def br():
        print '<br>'
    def p():
        print '<p>'
    def ul():
        print '<ul>'
def extract(database):
#sys.stdout.write('extract() is still a NOOP!\n')
br()
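    # cd into ~/.jpilot/plucker-urls, creating the plucker-urls subdirectory if necessary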
def chdir_base():
# could use some testing in the scenario where ~/.jpilot/plucker-urls does not preexist
try:
os.chdir(os.path.expanduser('~/.jpilot'))
except:
print 'cd ~/.jpilot failed'
br()
sys.exit(1)
        try:
            os.chdir('plucker-urls')
        except:
            # plucker-urls didn't exist (or wasn't enterable) - try to create it
            try:
                os.mkdir('plucker-urls', 0755)
            except OSError:
                # Already exists
                pass
            except:
                sys.stderr.write('mkdir plucker-urls failed\n')
                br()
                sys.exit(1)
            try:
                os.chdir('plucker-urls')
            except:
                sys.stdout.write('cd plucker-urls failed\n')
                br()
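    # Unpack the Palm memo database with "par", then scan any memo files we haven't
    # processed before for URLs, fetching page titles and adding new entries to url_database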
def update(url_database, filename_database):
num_added = 0
os.putenv('PATH', '%s:%s' % (os.environ['PATH'], os.path.expanduser('~/trees/rhel-3-i686/bin')))
#print os.environ['PATH']
os.system('par x ../MemoDB.pdb')
for filename in os.listdir('.'):
if filename_database.has_key(filename):
continue
try:
file = open(filename, 'r')
except:
sys.stdout.write('Opening %s failed - continuing\n' % filename)
br()
else:
memo_title = file.readline()
if memo_title[:13] == 'Plucker URLs ':
memo_title = memo_title[13:]
while 1:
line = file.readline()
if not line:
break
line = string.strip(line)
# strip out any characters with special values to this program or to C code
for sep in [ first_separator, second_separator, chr(0) ]:
line = delete_chars(line, sep)
if line != '':
url = url_class(line, filename, memo_title, '')
if url_database.has_key(url.to_database_key()):
# sys.stdout.write('%s: Already present: %s\n' % (time.ctime(time.time()), str(url)))
# br()
pass
else:
if line[:7] == 'http://':
url.get_title()
sys.stdout.write('%s: Adding: %d %s\n' % (time.ctime(time.time()), num_added, str(url)))
br()
url_database[url.to_database_key()] = url
num_added += 1
file.close()
filename_database[filename] = ''
if num_added >= 500:
sys.stdout.write("Won't add more URL's on this update. There've been too many, and your browser is likely to get cranky\n")
br()
return
sys.stdout.write("Added %d URL's\n" % num_added)
br()
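    # Emit one table row per non-hidden URL, followed by a visible/hidden/total summary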
    def display(database):
        sys.stdout.write("Plucker URL's\n")
        sys.stdout.write('\n')
        # The <table> markup is a reconstruction - the original tags were lost
        # when this listing was rendered as a web page
        sys.stdout.write('<table border="1">\n')
        keys = database.keys()
        urls = map(lambda x: database[x], keys)
        urls.sort()
        urls.reverse()
        hidden_keyno = 0
        visible_keyno = 0
        for item in urls:
            if item.hidden:
                hidden_keyno += 1
            else:
                sys.stdout.write('%s\n' % item.to_html())
                visible_keyno += 1
        sys.stdout.write('</table>\n')
        p()
        sys.stdout.write("%d URL's visible, %d hidden, total of %d\n" % (visible_keyno, hidden_keyno, visible_keyno+hidden_keyno))
        sys.stdout.write('\n')
    def erase_all(database):
        keyno = 0
        keys = database.keys()
        # keys.sort()
        num_keys = len(keys)
        for key in keys:
            keyno += 1
            print "Erasing key %d of %d" % (keyno, num_keys)
            br()
            # Actually remove the entry; without this the function only reported progress
            del database[key]
        print 'Done'
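    # Dispatch on the first CGI / command-line argument: update, display, erase-all or hide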
def main():
do_ct()
if os.uname()[1] == 'dcs.nac.uci.edu':
            print 'Disabled on dcs.nac.uci.edu.<br>Please use http://seki.nac.uci.edu/~strombrg/plucker-urls.cgi instead.'
return 0
#database = gdbm.open(os.path.expanduser('~/public_html/plucker-urls/database', 'w')
#def __init__(self, databasefile, databasetype, databasemode, to_string=None, from_string=None, max_elements_in_memory=10000, read_use=True, write_use=True, write_through=True, too_many_percent=95.0, \
# we're using a write-through cache here, because:
# 1) Performance isn't paramount for this project
# 2) It's simpler - no cache flushing to worry about
        url_database = cachedb.database_cache(os.path.expanduser('~/public_html/plucker-urls/url_database'),
                                              gdbm,
                                              'w',
                                              lambda x: x.to_database_representation(),
                                              from_database_representation,
                                              write_through=True)
filename_database = gdbm.open(os.path.expanduser('~/public_html/plucker-urls/filename_database'), 'w')
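        # filename_database remembers which unpacked memo files have already been
        # processed, so update() can skip them on later runs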
# for i in form.keys():
# print i,form[i]
#print cgi.parse_qsl(sys.argv[1], 1, 1)
# import urlparse
import urllib
# print '-->',urlparse.urlparse(sys.argv[1]),'<--'
args = map(urllib.unquote, sys.argv[:])
if args[1:]:
first_arg_list = string.splitfields(args[1], first_separator)
else:
first_arg_list = []
#sys.stdout.write('first_arg_list is %s\n\n' % str(first_arg_list))
if first_arg_list[0:] and first_arg_list[0] == 'update':
extract(url_database)
chdir_base()
update(url_database, filename_database)
elif first_arg_list[0:] and first_arg_list[0] == 'display':
chdir_base()
display(url_database)
elif first_arg_list[0:] and first_arg_list[0] == 'erase-all':
chdir_base()
erase_all(url_database)
elif first_arg_list[0:] and first_arg_list[0] == 'hide' and first_arg_list[1:]:
chdir_base()
hide(first_arg_list[1:])
else:
p()
            print 'Sorry, legal options to %s are "update", "display", "erase-all" and "hide", specified with a ? via CGI (for real life), or as an argument on the command line (for testing)' % sys.argv[0]
print 'You instead gave me --> %s <--\n' % str(sys.argv[1:])
main()
except:
    # Something blew up - make sure an HTTP header has gone out, then dump the
    # traceback to the browser.  The <pre> tags are a reconstruction; the original
    # markup was lost when this listing was rendered as a web page.
    do_ct()
    print
    print "<pre>\n\n==> Error! Debugging information follows:\n"
    traceback.print_exc()
    print "</pre>"