楼主: ReneeBK
7458 25

Programming Collective Intelligence   [推广有奖]

11
andrewfu1988 发表于 2014-8-31 09:22:13 |只看作者 |坛友微信交流群

Chapter 5: Optimization

  1. import time
  2. import urllib2
  3. import xml.dom.minidom

  4. kayakkey='YOUR KEY HERE'

  5. def getkayaksession():
  6.   # Construct the URL to start a session
  7.   url='http://www.kayak.com/k/ident/apisession?token=%s&version=1' % kayakkey
  8.   
  9.   # Parse the resulting XML
  10.   doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())
  11.   
  12.   # Find <sid>xxxxxxxx</sid>
  13.   sid=doc.getElementsByTagName('sid')[0].firstChild.data
  14.   return sid

  15. def flightsearch(sid,origin,destination,depart_date):
  16.   
  17.   # Construct search URL
  18.   url='http://www.kayak.com/s/apisearch?basicmode=true&oneway=y&origin=%s' % origin
  19.   url+='&destination=%s&depart_date=%s' % (destination,depart_date)
  20.   url+='&return_date=none&depart_time=a&return_time=a'
  21.   url+='&travelers=1&cabin=e&action=doFlights&apimode=1'
  22.   url+='&_sid_=%s&version=1' % (sid)

  23.   # Get the XML
  24.   doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())

  25.   # Extract the search ID
  26.   searchid=doc.getElementsByTagName('searchid')[0].firstChild.data

  27.   return searchid

  28. def flightsearchresults(sid,searchid):
  29.   def parseprice(p):
  30.     return float(p[1:].replace(',',''))

  31.   # Polling loop
  32.   while 1:
  33.     time.sleep(2)

  34.     # Construct URL for polling
  35.     url='http://www.kayak.com/s/basic/flight?'
  36.     url+='searchid=%s&c=5&apimode=1&_sid_=%s&version=1' % (searchid,sid)
  37.     doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())

  38.     # Look for morepending tag, and wait until it is no longer true
  39.     morepending=doc.getElementsByTagName('morepending')[0].firstChild
  40.     if morepending==None or morepending.data=='false': break

  41.   # Now download the complete list
  42.   url='http://www.kayak.com/s/basic/flight?'
  43.   url+='searchid=%s&c=999&apimode=1&_sid_=%s&version=1' % (searchid,sid)
  44.   doc=xml.dom.minidom.parseString(urllib2.urlopen(url).read())

  45.   # Get the various elements as lists
  46.   prices=doc.getElementsByTagName('price')
  47.   departures=doc.getElementsByTagName('depart')
  48.   arrivals=doc.getElementsByTagName('arrive')  

  49.   # Zip them together
  50.   return zip([p.firstChild.data.split(' ')[1] for p in departures],
  51.              [p.firstChild.data.split(' ')[1] for p in arrivals],
  52.              [parseprice(p.firstChild.data) for p in prices])


  53. def createschedule(people,dest,dep,ret):
  54.   # Get a session id for these searches
  55.   sid=getkayaksession()
  56.   flights={}
  57.   
  58.   for p in people:
  59.     name,origin=p
  60.     # Outbound flight
  61.     searchid=flightsearch(sid,origin,dest,dep)
  62.     flights[(origin,dest)]=flightsearchresults(sid,searchid)
  63.    
  64.     # Return flight
  65.     searchid=flightsearch(sid,dest,origin,ret)
  66.     flights[(dest,origin)]=flightsearchresults(sid,searchid)
  67.    
  68.   return flights
复制代码

使用道具

12
life_life 发表于 2014-9-1 19:44:58 |只看作者 |坛友微信交流群

Chapter 5: Optimization

  1. import random
  2. import math

  3. # The dorms, each of which has two available spaces
  4. dorms=['Zeus','Athena','Hercules','Bacchus','Pluto']

  5. # People, along with their first and second choices
  6. prefs=[('Toby', ('Bacchus', 'Hercules')),
  7.        ('Steve', ('Zeus', 'Pluto')),
  8.        ('Karen', ('Athena', 'Zeus')),
  9.        ('Sarah', ('Zeus', 'Pluto')),
  10.        ('Dave', ('Athena', 'Bacchus')),
  11.        ('Jeff', ('Hercules', 'Pluto')),
  12.        ('Fred', ('Pluto', 'Athena')),
  13.        ('Suzie', ('Bacchus', 'Hercules')),
  14.        ('Laura', ('Bacchus', 'Hercules')),
  15.        ('James', ('Hercules', 'Athena'))]

  16. # [(0,9),(0,8),(0,7),(0,6),...,(0,0)]
  17. domain=[(0,(len(dorms)*2)-i-1) for i in range(0,len(dorms)*2)]

  18. def printsolution(vec):
  19.   slots=[]
  20.   # Create two slots for each dorm
  21.   for i in range(len(dorms)): slots+=[i,i]

  22.   # Loop over each students assignment
  23.   for i in range(len(vec)):
  24.     x=int(vec[i])

  25.     # Choose the slot from the remaining ones
  26.     dorm=dorms[slots[x]]
  27.     # Show the student and assigned dorm
  28.     print prefs[i][0],dorm
  29.     # Remove this slot
  30.     del slots[x]

  31. def dormcost(vec):
  32.   cost=0
  33.   # Create list a of slots
  34.   slots=[0,0,1,1,2,2,3,3,4,4]

  35.   # Loop over each student
  36.   for i in range(len(vec)):
  37.     x=int(vec[i])
  38.     dorm=dorms[slots[x]]
  39.     pref=prefs[i][1]
  40.     # First choice costs 0, second choice costs 1
  41.     if pref[0]==dorm: cost+=0
  42.     elif pref[1]==dorm: cost+=1
  43.     else: cost+=3
  44.     # Not on the list costs 3

  45.     # Remove selected slot
  46.     del slots[x]
  47.    
  48.   return cost
复制代码

使用道具

13
lenny1219 发表于 2014-9-22 03:48:54 |只看作者 |坛友微信交流群

Chapter 6: Document Filtering

  1. import feedparser
  2. import re

  3. # Takes a filename of URL of a blog feed and classifies the entries
  4. def read(feed,classifier):
  5.   # Get feed entries and loop over them
  6.   f=feedparser.parse(feed)
  7.   for entry in f['entries']:
  8.     print
  9.     print '-----'
  10.     # Print the contents of the entry
  11.     print 'Title:     '+entry['title'].encode('utf-8')
  12.     print 'Publisher: '+entry['publisher'].encode('utf-8')
  13.     print
  14.     print entry['summary'].encode('utf-8')
  15.    

  16.     # Combine all the text to create one item for the classifier
  17.     fulltext='%s\n%s\n%s' % (entry['title'],entry['publisher'],entry['summary'])

  18.     # Print the best guess at the current category
  19.     print 'Guess: '+str(classifier.classify(entry))

  20.     # Ask the user to specify the correct category and train on that
  21.     cl=raw_input('Enter category: ')
  22.     classifier.train(entry,cl)


  23. def entryfeatures(entry):
  24.   splitter=re.compile('\\W*')
  25.   f={}
  26.   
  27.   # Extract the title words and annotate
  28.   titlewords=[s.lower() for s in splitter.split(entry['title'])
  29.           if len(s)>2 and len(s)<20]
  30.   for w in titlewords: f['Title:'+w]=1
  31.   
  32.   # Extract the summary words
  33.   summarywords=[s.lower() for s in splitter.split(entry['summary'])
  34.           if len(s)>2 and len(s)<20]

  35.   # Count uppercase words
  36.   uc=0
  37.   for i in range(len(summarywords)):
  38.     w=summarywords[i]
  39.     f[w]=1
  40.     if w.isupper(): uc+=1
  41.    
  42.     # Get word pairs in summary as features
  43.     if i<len(summarywords)-1:
  44.       twowords=' '.join(summarywords[i:i+1])
  45.       f[twowords]=1
  46.    
  47.   # Keep creator and publisher whole
  48.   f['Publisher:'+entry['publisher']]=1

  49.   # UPPERCASE is a virtual word flagging too much shouting  
  50.   if float(uc)/len(summarywords)>0.3: f['UPPERCASE']=1
  51.   
  52.   return f
复制代码

使用道具

14
Lisrelchen 发表于 2014-10-16 10:40:05 |只看作者 |坛友微信交流群

Chapter 6: Document Filtering

  1. from pysqlite2 import dbapi2 as sqlite
  2. import re
  3. import math

  4. def getwords(doc):
  5.   splitter=re.compile('\\W*')
  6.   print doc
  7.   # Split the words by non-alpha characters
  8.   words=[s.lower() for s in splitter.split(doc)
  9.           if len(s)>2 and len(s)<20]
  10.   
  11.   # Return the unique set of words only
  12.   return dict([(w,1) for w in words])

  13. class classifier:
  14.   def __init__(self,getfeatures,filename=None):
  15.     # Counts of feature/category combinations
  16.     self.fc={}
  17.     # Counts of documents in each category
  18.     self.cc={}
  19.     self.getfeatures=getfeatures
  20.    
  21.   def setdb(self,dbfile):
  22.     self.con=sqlite.connect(dbfile)   
  23.     self.con.execute('create table if not exists fc(feature,category,count)')
  24.     self.con.execute('create table if not exists cc(category,count)')


  25.   def incf(self,f,cat):
  26.     count=self.fcount(f,cat)
  27.     if count==0:
  28.       self.con.execute("insert into fc values ('%s','%s',1)"
  29.                        % (f,cat))
  30.     else:
  31.       self.con.execute(
  32.         "update fc set count=%d where feature='%s' and category='%s'"
  33.         % (count+1,f,cat))
  34.   
  35.   def fcount(self,f,cat):
  36.     res=self.con.execute(
  37.       'select count from fc where feature="%s" and category="%s"'
  38.       %(f,cat)).fetchone()
  39.     if res==None: return 0
  40.     else: return float(res[0])

  41.   def incc(self,cat):
  42.     count=self.catcount(cat)
  43.     if count==0:
  44.       self.con.execute("insert into cc values ('%s',1)" % (cat))
  45.     else:
  46.       self.con.execute("update cc set count=%d where category='%s'"
  47.                        % (count+1,cat))   

  48.   def catcount(self,cat):
  49.     res=self.con.execute('select count from cc where category="%s"'
  50.                          %(cat)).fetchone()
  51.     if res==None: return 0
  52.     else: return float(res[0])

  53.   def categories(self):
  54.     cur=self.con.execute('select category from cc');
  55.     return [d[0] for d in cur]

  56.   def totalcount(self):
  57.     res=self.con.execute('select sum(count) from cc').fetchone();
  58.     if res==None: return 0
  59.     return res[0]


  60.   def train(self,item,cat):
  61.     features=self.getfeatures(item)
  62.     # Increment the count for every feature with this category
  63.     for f in features:
  64.       self.incf(f,cat)

  65.     # Increment the count for this category
  66.     self.incc(cat)
  67.     self.con.commit()

  68.   def fprob(self,f,cat):
  69.     if self.catcount(cat)==0: return 0

  70.     # The total number of times this feature appeared in this
  71.     # category divided by the total number of items in this category
  72.     return self.fcount(f,cat)/self.catcount(cat)

  73.   def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
  74.     # Calculate current probability
  75.     basicprob=prf(f,cat)

  76.     # Count the number of times this feature has appeared in
  77.     # all categories
  78.     totals=sum([self.fcount(f,c) for c in self.categories()])

  79.     # Calculate the weighted average
  80.     bp=((weight*ap)+(totals*basicprob))/(weight+totals)
  81.     return bp




  82. class naivebayes(classifier):
  83.   
  84.   def __init__(self,getfeatures):
  85.     classifier.__init__(self,getfeatures)
  86.     self.thresholds={}
  87.   
  88.   def docprob(self,item,cat):
  89.     features=self.getfeatures(item)   

  90.     # Multiply the probabilities of all the features together
  91.     p=1
  92.     for f in features: p*=self.weightedprob(f,cat,self.fprob)
  93.     return p

  94.   def prob(self,item,cat):
  95.     catprob=self.catcount(cat)/self.totalcount()
  96.     docprob=self.docprob(item,cat)
  97.     return docprob*catprob
  98.   
  99.   def setthreshold(self,cat,t):
  100.     self.thresholds[cat]=t
  101.    
  102.   def getthreshold(self,cat):
  103.     if cat not in self.thresholds: return 1.0
  104.     return self.thresholds[cat]
  105.   
  106.   def classify(self,item,default=None):
  107.     probs={}
  108.     # Find the category with the highest probability
  109.     max=0.0
  110.     for cat in self.categories():
  111.       probs[cat]=self.prob(item,cat)
  112.       if probs[cat]>max:
  113.         max=probs[cat]
  114.         best=cat

  115.     # Make sure the probability exceeds threshold*next best
  116.     for cat in probs:
  117.       if cat==best: continue
  118.       if probs[cat]*self.getthreshold(best)>probs[best]: return default
  119.     return best

  120. class fisherclassifier(classifier):
  121.   def cprob(self,f,cat):
  122.     # The frequency of this feature in this category   
  123.     clf=self.fprob(f,cat)
  124.     if clf==0: return 0

  125.     # The frequency of this feature in all the categories
  126.     freqsum=sum([self.fprob(f,c) for c in self.categories()])

  127.     # The probability is the frequency in this category divided by
  128.     # the overall frequency
  129.     p=clf/(freqsum)
  130.    
  131.     return p
  132.   def fisherprob(self,item,cat):
  133.     # Multiply all the probabilities together
  134.     p=1
  135.     features=self.getfeatures(item)
  136.     for f in features:
  137.       p*=(self.weightedprob(f,cat,self.cprob))

  138.     # Take the natural log and multiply by -2
  139.     fscore=-2*math.log(p)

  140.     # Use the inverse chi2 function to get a probability
  141.     return self.invchi2(fscore,len(features)*2)
  142.   def invchi2(self,chi, df):
  143.     m = chi / 2.0
  144.     sum = term = math.exp(-m)
  145.     for i in range(1, df//2):
  146.         term *= m / i
  147.         sum += term
  148.     return min(sum, 1.0)
  149.   def __init__(self,getfeatures):
  150.     classifier.__init__(self,getfeatures)
  151.     self.minimums={}

  152.   def setminimum(self,cat,min):
  153.     self.minimums[cat]=min
  154.   
  155.   def getminimum(self,cat):
  156.     if cat not in self.minimums: return 0
  157.     return self.minimums[cat]
  158.   def classify(self,item,default=None):
  159.     # Loop through looking for the best result
  160.     best=default
  161.     max=0.0
  162.     for c in self.categories():
  163.       p=self.fisherprob(item,c)
  164.       # Make sure it exceeds its minimum
  165.       if p>self.getminimum(c) and p>max:
  166.         best=c
  167.         max=p
  168.     return best


  169. def sampletrain(cl):
  170.   cl.train('Nobody owns the water.','good')
  171.   cl.train('the quick rabbit jumps fences','good')
  172.   cl.train('buy pharmaceuticals now','bad')
  173.   cl.train('make quick money at the online casino','bad')
  174.   cl.train('the quick brown fox jumps','good')
复制代码


使用道具

15
hyq2003 发表于 2014-10-23 08:40:44 |只看作者 |坛友微信交流群
have a look

使用道具

16
fuwenge 在职认证  发表于 2014-10-23 09:06:40 |只看作者 |坛友微信交流群
不明觉厉

使用道具

17
yolandawine 发表于 2014-10-23 09:43:37 |只看作者 |坛友微信交流群
一看就好高大上。。。。。

使用道具

18
fjrong 在职认证  发表于 2014-10-23 10:12:15 |只看作者 |坛友微信交流群
好书,感谢分享

使用道具

19
gaojianwqjk 发表于 2014-10-23 12:25:04 |只看作者 |坛友微信交流群

使用道具

20
richardchan001 发表于 2014-10-24 00:40:25 |只看作者 |坛友微信交流群
xie xie
xie xie
xie xie

使用道具

您需要登录后才可以回帖 登录 | 我要注册

本版微信群
加好友,备注jltj
拉您入交流群

京ICP备16021002-2号 京B2-20170662号 京公网安备 11010802022788号 论坛法律顾问:王进律师 知识产权保护声明   免责及隐私声明

GMT+8, 2024-5-4 08:41