签到
- 苹果/安卓/wp
- 苹果/安卓/wp
客户端
0.0

0.00

人大经济论坛 › 论坛 › 计量经济学与统计论坛五区 › 计量经济学与统计软件 › winbugs及其他软件专版 › Programming Collective Intelligence

CDA数据分析研究院

商业数据分析与大数据领航教育品牌



经管云课堂

经管/金融/财会/社科/名师公开课



学术培训

Stata 空间计量 SSCI Python

贵宾：通行论坛特权+数据库权限
+案例库+下载特权 VIP：论坛特权+更多下载次数
+ccerdata数据库+更高阅读权限+……

上一页 123 下一页

提升主题| 本版置顶| 关闭主题| 变更主题颜色| 抢沙发| 顶贴| 道具中心

楼主: ReneeBK

7458 25

Programming Collective Intelligence [推广有奖]

11楼

andrewfu1988 发表于 2014-8-31 09:22:13 |只看作者 |坛友微信交流群

Chapter 5: Optimization

import time
import urllib2
import xml.dom.minidom
# Kayak API developer key. getkayaksession() interpolates it into the
# session URL as the ``token`` parameter; replace the placeholder with a
# real key before running any search.
kayakkey='YOUR KEY HERE'
def getkayaksession():
    """Start a Kayak API session and return its session id.

    Requests the ``apisession`` endpoint using the module-level ``kayakkey``
    as the token and returns the text of the first ``<sid>`` element found
    in the XML reply.

    Raises:
        IndexError: if the reply contains no ``<sid>`` element.
    """
    # Construct the URL to start a session
    url = 'http://www.kayak.com/k/ident/apisession?token=%s&version=1' % kayakkey
    # Read the reply, making sure the connection is released even when the
    # read itself fails (the response object was previously never closed).
    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        response.close()
    # Parse the resulting XML and find <sid>xxxxxxxx</sid>
    doc = xml.dom.minidom.parseString(payload)
    sid = doc.getElementsByTagName('sid')[0].firstChild.data
    return sid
def flightsearch(sid,origin,destination,depart_date):
    """Start a one-way flight search and return its search id.

    Args:
        sid: session id, sent as the ``_sid_`` query parameter.
        origin: value sent as the ``origin`` query parameter.
        destination: value sent as the ``destination`` query parameter.
        depart_date: value sent as the ``depart_date`` query parameter.

    Returns:
        The text of the first ``<searchid>`` element of the XML reply.

    Raises:
        IndexError: if the reply contains no ``<searchid>`` element.
    """
    # Construct search URL
    url = 'http://www.kayak.com/s/apisearch?basicmode=true&oneway=y&origin=%s' % origin
    url += '&destination=%s&depart_date=%s' % (destination, depart_date)
    url += '&return_date=none&depart_time=a&return_time=a'
    url += '&travelers=1&cabin=e&action=doFlights&apimode=1'
    url += '&_sid_=%s&version=1' % (sid)
    # Get the XML, closing the connection even if the read fails
    response = urllib2.urlopen(url)
    try:
        payload = response.read()
    finally:
        response.close()
    doc = xml.dom.minidom.parseString(payload)
    # Extract the search ID
    searchid = doc.getElementsByTagName('searchid')[0].firstChild.data
    return searchid
def flightsearchresults(sid,searchid):
    """Wait for a flight search to finish and return its results.

    Polls the ``basic/flight`` endpoint every two seconds until the reply's
    ``<morepending>`` element is empty or reads ``false``, then downloads
    the result list with ``c=999``.

    Args:
        sid: session id, sent as the ``_sid_`` query parameter.
        searchid: search id, sent as the ``searchid`` query parameter.

    Returns:
        A list of ``(departure, arrival, price)`` tuples. ``departure`` and
        ``arrival`` are the second space-separated token of each
        ``<depart>`` / ``<arrive>`` element; ``price`` is a float.

    Raises:
        IndexError: if a polling reply has no ``<morepending>`` element.
    """
    def parseprice(p):
        # Drop the first character (presumably the currency symbol) and any
        # thousands separators before converting.
        return float(p[1:].replace(',', ''))

    def fetch(count):
        # Download and parse one result page; ``count`` fills the ``c``
        # query parameter. The connection is always closed.
        url = 'http://www.kayak.com/s/basic/flight?'
        url += 'searchid=%s&c=%s&apimode=1&_sid_=%s&version=1' % (searchid, count, sid)
        response = urllib2.urlopen(url)
        try:
            payload = response.read()
        finally:
            response.close()
        return xml.dom.minidom.parseString(payload)

    # Polling loop.
    # NOTE(review): there is no upper bound on the number of polls; a search
    # that never completes will loop forever.
    while True:
        time.sleep(2)
        doc = fetch(5)
        # Look for morepending tag, and wait until it is no longer true
        morepending = doc.getElementsByTagName('morepending')[0].firstChild
        if morepending is None or morepending.data == 'false':
            break

    # Now download the complete list
    doc = fetch(999)

    # Get the various elements as lists
    prices = doc.getElementsByTagName('price')
    departures = doc.getElementsByTagName('depart')
    arrivals = doc.getElementsByTagName('arrive')

    # Zip them together; list() keeps the result a real list on every
    # Python version.
    return list(zip([d.firstChild.data.split(' ')[1] for d in departures],
                    [a.firstChild.data.split(' ')[1] for a in arrivals],
                    [parseprice(p.firstChild.data) for p in prices]))
def createschedule(people,dest,dep,ret):
    """Look up outbound and return flights for every traveller's origin.

    Args:
        people: iterable of ``(name, origin)`` pairs; only ``origin`` is used.
        dest: destination passed to the outbound search and used as the
            origin of the return search.
        dep: departure date for the outbound searches.
        ret: departure date for the return searches.

    Returns:
        A dict mapping ``(origin, dest)`` and ``(dest, origin)`` to the
        value returned by ``flightsearchresults`` for that route.
    """
    # Get a session id for these searches
    sid = getkayaksession()
    flights = {}
    searched = set()
    for _name, origin in people:
        # Travellers sharing an origin produce the same dictionary keys, so
        # repeating the two slow network searches would only overwrite them.
        if origin in searched:
            continue
        searched.add(origin)
        # Outbound flight
        searchid = flightsearch(sid, origin, dest, dep)
        flights[(origin, dest)] = flightsearchresults(sid, searchid)
        # Return flight
        searchid = flightsearch(sid, dest, origin, ret)
        flights[(dest, origin)] = flightsearchresults(sid, searchid)
    return flights

复制代码

回复

使用道具举报

12楼

life_life 发表于 2014-9-1 19:44:58 |只看作者 |坛友微信交流群

Chapter 5: Optimization

import random
import math
# The dorms, each of which has two available spaces
dorms = ['Zeus', 'Athena', 'Hercules', 'Bacchus', 'Pluto']

# People, each paired with a (first choice, second choice) tuple of dorms
prefs = [
    ('Toby', ('Bacchus', 'Hercules')),
    ('Steve', ('Zeus', 'Pluto')),
    ('Karen', ('Athena', 'Zeus')),
    ('Sarah', ('Zeus', 'Pluto')),
    ('Dave', ('Athena', 'Bacchus')),
    ('Jeff', ('Hercules', 'Pluto')),
    ('Fred', ('Pluto', 'Athena')),
    ('Suzie', ('Bacchus', 'Hercules')),
    ('Laura', ('Bacchus', 'Hercules')),
    ('James', ('Hercules', 'Athena')),
]

# One (low, high) pair per slot, with the upper bound counting down to 0:
# [(0,9),(0,8),(0,7),(0,6),...,(0,0)]
# Presumably these bound each entry of the solution vector, whose valid
# range shrinks as slots are removed -- confirm against the optimizer.
domain = [(0, upper) for upper in range(len(dorms) * 2 - 1, -1, -1)]
def printsolution(vec):
    """Print each student's name followed by the dorm that vec gives them.

    ``vec[i]`` is converted to an int and used as an index into the list of
    slots still free when student ``i`` (the i-th entry of ``prefs``) is
    placed; the chosen slot is then removed.
    """
    # Two slots per dorm, each identified by the dorm's index in ``dorms``
    slots = []
    for dorm_index in range(len(dorms)):
        slots.extend([dorm_index, dorm_index])

    # Walk the students in order, taking their slot out of the pool
    for student, choice in enumerate(vec):
        pick = int(choice)
        dorm = dorms[slots[pick]]
        # Show the student and assigned dorm
        print('%s %s' % (prefs[student][0], dorm))
        slots.pop(pick)
def dormcost(vec):
    """Return the total dissatisfaction cost of the assignment in vec.

    ``vec[i]`` is converted to an int and used as an index into the list of
    slots still free when student ``i`` (the i-th entry of ``prefs``) is
    placed. A student placed in their first choice costs 0, in their second
    choice 1, and anywhere else 3.

    Raises:
        IndexError: if an entry of vec points past the remaining slots.
    """
    cost = 0
    # Two slots per dorm, derived from ``dorms`` (the same way
    # printsolution builds them) rather than hard-coded for five dorms.
    slots = []
    for dorm_index in range(len(dorms)):
        slots += [dorm_index, dorm_index]

    # Loop over each student
    for i, choice in enumerate(vec):
        x = int(choice)
        dorm = dorms[slots[x]]
        pref = prefs[i][1]
        if pref[0] == dorm:
            # First choice costs 0
            cost += 0
        elif pref[1] == dorm:
            # Second choice costs 1
            cost += 1
        else:
            # Not on the list costs 3
            cost += 3
        # Remove selected slot
        del slots[x]
    return cost

复制代码

回复

使用道具举报

13楼

lenny1219 发表于 2014-9-22 03:48:54 |只看作者 |坛友微信交流群

Chapter 6: Document Filtering

import feedparser
import re
# Takes a filename or URL of a blog feed and classifies the entries
def read(feed,classifier):
    """Interactively classify and train on every entry of a feed.

    For each entry returned by ``feedparser.parse(feed)`` this prints the
    title, publisher and summary, prints ``classifier.classify(entry)`` as
    the current guess, asks the user for the correct category and calls
    ``classifier.train(entry, category)`` with the answer.

    Args:
        feed: passed unchanged to ``feedparser.parse``.
        classifier: object providing ``classify(entry)`` and
            ``train(entry, category)``.

    NOTE(review): relies on Python 2 behaviour (``raw_input``, and
    concatenating ``str`` with the result of ``.encode('utf-8')``).
    """
    # Get feed entries and loop over them
    f = feedparser.parse(feed)
    for entry in f['entries']:
        print('')
        print('-----')
        # Print the contents of the entry
        print('Title: ' + entry['title'].encode('utf-8'))
        print('Publisher: ' + entry['publisher'].encode('utf-8'))
        print('')
        print(entry['summary'].encode('utf-8'))

        # The whole entry (not a combined text string) goes to the
        # classifier, so no intermediate text is built here.
        # Print the best guess at the current category
        print('Guess: ' + str(classifier.classify(entry)))

        # Ask the user to specify the correct category and train on that
        cl = raw_input('Enter category: ')
        classifier.train(entry, cl)
def entryfeatures(entry):
    """Extract a feature dictionary from a feed entry.

    Args:
        entry: mapping with ``'title'``, ``'summary'`` and ``'publisher'``
            string values.

    Returns:
        A dict whose keys are the features and whose values are all 1:
        ``'Title:<word>'`` for each title word, each lowercased summary
        word, each lowercased pair of adjacent summary words joined by a
        space, ``'Publisher:<publisher>'``, and ``'UPPERCASE'`` when more
        than 30% of the summary words are written entirely in capitals.
        Only words of 3 to 19 characters are considered.
    """
    # ``\W+`` rather than ``\W*``: a pattern that can match the empty string
    # makes re.split cut between every character on Python 3.7+.
    splitter = re.compile(r'\W+')
    f = {}

    # Extract the title words and annotate
    titlewords = [s.lower() for s in splitter.split(entry['title'])
                  if len(s) > 2 and len(s) < 20]
    for w in titlewords:
        f['Title:' + w] = 1

    # Extract the summary words, keeping their original case so that
    # shouting can still be detected below
    summarywords = [s for s in splitter.split(entry['summary'])
                    if len(s) > 2 and len(s) < 20]

    # Count uppercase words
    uc = 0
    for i, w in enumerate(summarywords):
        f[w.lower()] = 1
        if w.isupper():
            uc += 1
        # Get word pairs in summary as features; the slice must span two
        # words (i and i+1) to form a pair
        if i < len(summarywords) - 1:
            twowords = ' '.join(summarywords[i:i + 2]).lower()
            f[twowords] = 1

    # Keep creator and publisher whole
    f['Publisher:' + entry['publisher']] = 1

    # UPPERCASE is a virtual word flagging too much shouting; an empty
    # summary has no words to shout with, and must not divide by zero
    if summarywords and float(uc) / len(summarywords) > 0.3:
        f['UPPERCASE'] = 1
    return f

复制代码

回复

使用道具举报

14楼

Lisrelchen 发表于 2014-10-16 10:40:05 |只看作者 |坛友微信交流群

Chapter 6: Document Filtering

from pysqlite2 import dbapi2 as sqlite
import re
import math
def getwords(doc):
    """Return the distinct lowercased words of doc as a feature dict.

    Args:
        doc: the text to split.

    Returns:
        A dict mapping every word of 3 to 19 characters, lowercased, to 1.
    """
    # ``\W+`` rather than ``\W*``: a pattern that can match the empty string
    # makes re.split cut between every character on Python 3.7+.
    splitter = re.compile(r'\W+')
    # Split the words by non-alpha characters
    words = [s.lower() for s in splitter.split(doc)
             if len(s) > 2 and len(s) < 20]
    # Return the unique set of words only
    return dict.fromkeys(words, 1)
class classifier:
    """Base classifier that keeps its training counts in an SQLite database.

    Counts live in two tables created by ``setdb``: ``fc(feature, category,
    count)`` and ``cc(category, count)``. ``setdb`` must be called before
    any method that reads or writes counts.

    All SQL uses bound parameters, so feature and category values may
    contain quotes or coincide with column names.
    """

    def __init__(self, getfeatures, filename=None):
        """Store the feature extractor.

        Args:
            getfeatures: callable taking an item and returning an iterable
                of features.
            filename: accepted but not used by this class.
        """
        # Counts of feature/category combinations
        self.fc = {}
        # Counts of documents in each category
        self.cc = {}
        self.getfeatures = getfeatures

    def setdb(self, dbfile):
        """Open dbfile and create the ``fc`` and ``cc`` tables if missing."""
        self.con = sqlite.connect(dbfile)
        self.con.execute('create table if not exists fc(feature,category,count)')
        self.con.execute('create table if not exists cc(category,count)')

    def incf(self, f, cat):
        """Add one to the count of feature f in category cat."""
        count = self.fcount(f, cat)
        if count == 0:
            self.con.execute('insert into fc values (?,?,1)', (f, cat))
        else:
            # fcount returns a float; store the count back as an integer
            self.con.execute(
                'update fc set count=? where feature=? and category=?',
                (int(count) + 1, f, cat))

    def fcount(self, f, cat):
        """Return how often feature f was seen in category cat (0 if never)."""
        res = self.con.execute(
            'select count from fc where feature=? and category=?',
            (f, cat)).fetchone()
        if res is None:
            return 0
        return float(res[0])

    def incc(self, cat):
        """Add one to the number of items trained in category cat."""
        count = self.catcount(cat)
        if count == 0:
            self.con.execute('insert into cc values (?,1)', (cat,))
        else:
            self.con.execute('update cc set count=? where category=?',
                             (int(count) + 1, cat))

    def catcount(self, cat):
        """Return the number of items trained in category cat (0 if none)."""
        res = self.con.execute('select count from cc where category=?',
                               (cat,)).fetchone()
        if res is None:
            return 0
        return float(res[0])

    def categories(self):
        """Return the list of all categories present in ``cc``."""
        cur = self.con.execute('select category from cc')
        return [row[0] for row in cur]

    def totalcount(self):
        """Return the total number of trained items (0 when there are none)."""
        res = self.con.execute('select sum(count) from cc').fetchone()
        if res is None:
            return 0
        # sum() over an empty table yields a row holding NULL, not no row
        return res[0] or 0

    def train(self, item, cat):
        """Record item as an example of category cat and commit."""
        features = self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f, cat)
        # Increment the count for this category
        self.incc(cat)
        self.con.commit()

    def fprob(self, f, cat):
        """Return fcount(f, cat) / catcount(cat), or 0 for an unseen category."""
        if self.catcount(cat) == 0:
            return 0
        # The total number of times this feature appeared in this
        # category divided by the total number of items in this category
        return self.fcount(f, cat) / self.catcount(cat)

    def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
        """Blend prf(f, cat) with an assumed probability.

        Args:
            f: the feature.
            cat: the category.
            prf: callable ``prf(f, cat)`` giving the basic probability.
            weight: weight given to the assumed probability.
            ap: the assumed probability.

        Returns:
            ``(weight*ap + totals*basicprob) / (weight + totals)`` where
            ``totals`` is the count of f summed over all categories.
        """
        # Calculate current probability
        basicprob = prf(f, cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals = sum([self.fcount(f, c) for c in self.categories()])
        # Calculate the weighted average
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp
class naivebayes(classifier):
    """Naive Bayes classifier with optional per-category thresholds."""

    def __init__(self, getfeatures):
        """Initialise the base classifier with an empty threshold table."""
        classifier.__init__(self, getfeatures)
        self.thresholds = {}

    def docprob(self, item, cat):
        """Return the product of the weighted probabilities of item's features in cat."""
        features = self.getfeatures(item)
        # Multiply the probabilities of all the features together
        p = 1
        for f in features:
            p *= self.weightedprob(f, cat, self.fprob)
        return p

    def prob(self, item, cat):
        """Return docprob(item, cat) scaled by the share of items in cat.

        Returns 0.0 when nothing has been trained yet, instead of dividing
        by an empty total.
        """
        total = self.totalcount()
        if not total:
            return 0.0
        catprob = self.catcount(cat) / total
        docprob = self.docprob(item, cat)
        return docprob * catprob

    def setthreshold(self, cat, t):
        """Set the threshold for category cat to t."""
        self.thresholds[cat] = t

    def getthreshold(self, cat):
        """Return the threshold for cat, or 1.0 when none was set."""
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]

    def classify(self, item, default=None):
        """Return the most probable category for item, or default.

        ``default`` is returned when no category has a probability above
        zero (for example before any training), or when some other
        category's probability times the winner's threshold exceeds the
        winner's probability.
        """
        probs = {}
        # Find the category with the highest probability
        best = None
        highest = 0.0
        for cat in self.categories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > highest:
                highest = probs[cat]
                best = cat

        # Nothing scored above zero: there is no winner to validate
        if best is None:
            return default

        # Make sure the probability exceeds threshold*next best
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthreshold(best) > probs[best]:
                return default
        return best
class fisherclassifier(classifier):
    """Classifier that combines feature probabilities with Fisher's method."""

    def __init__(self, getfeatures):
        """Initialise the base classifier with an empty table of minimums."""
        classifier.__init__(self, getfeatures)
        self.minimums = {}

    def cprob(self, f, cat):
        """Return fprob(f, cat) as a share of fprob(f, c) over all categories.

        Returns 0 when the feature has never been seen in cat.
        """
        # The frequency of this feature in this category
        clf = self.fprob(f, cat)
        if clf == 0:
            return 0
        # The frequency of this feature in all the categories
        freqsum = sum([self.fprob(f, c) for c in self.categories()])
        # The probability is the frequency in this category divided by
        # the overall frequency
        p = clf / (freqsum)
        return p

    def fisherprob(self, item, cat):
        """Return the Fisher-method score of item for category cat.

        The score is ``invchi2(-2 * sum(log(p_f)), 2 * number_of_features)``
        where ``p_f`` is the weighted ``cprob`` of each feature.
        """
        features = self.getfeatures(item)
        # Add the logs instead of multiplying and taking one log at the end:
        # the product of many small probabilities underflows to 0.0, and
        # math.log(0) raises ValueError.
        logsum = 0.0
        for f in features:
            logsum += math.log(self.weightedprob(f, cat, self.cprob))
        # Multiply the natural log of the combined probability by -2
        fscore = -2 * logsum
        # Use the inverse chi2 function to get a probability
        return self.invchi2(fscore, len(features) * 2)

    def invchi2(self, chi, df):
        """Return the inverse chi-square value for chi with df degrees, capped at 1.0."""
        m = chi / 2.0
        total = term = math.exp(-m)
        for i in range(1, df // 2):
            term *= m / i
            total += term
        return min(total, 1.0)

    def setminimum(self, cat, min):
        """Set the minimum score required to classify an item as cat."""
        self.minimums[cat] = min

    def getminimum(self, cat):
        """Return the minimum score for cat, or 0 when none was set."""
        if cat not in self.minimums:
            return 0
        return self.minimums[cat]

    def classify(self, item, default=None):
        """Return the category with the highest Fisher score, or default.

        A category only qualifies when its score is above both its minimum
        and the best score seen so far.
        """
        # Loop through looking for the best result
        best = default
        highest = 0.0
        for c in self.categories():
            p = self.fisherprob(item, c)
            # Make sure it exceeds its minimum
            if p > self.getminimum(c) and p > highest:
                best = c
                highest = p
        return best
def sampletrain(cl):
    """Train cl on five short hand-labelled example sentences.

    Calls ``cl.train(text, label)`` once per example, in a fixed order.
    """
    examples = (
        ('Nobody owns the water.', 'good'),
        ('the quick rabbit jumps fences', 'good'),
        ('buy pharmaceuticals now', 'bad'),
        ('make quick money at the online casino', 'bad'),
        ('the quick brown fox jumps', 'good'),
    )
    for text, label in examples:
        cl.train(text, label)

复制代码

回复

使用道具举报

15楼

hyq2003 发表于 2014-10-23 08:40:44 |只看作者 |坛友微信交流群

have a look

回复

使用道具举报

16楼

在职认证

发表于 2014-10-23 09:06:40 |只看作者 |坛友微信交流群

不明觉厉

回复

使用道具举报

17楼

yolandawine 发表于 2014-10-23 09:43:37 |只看作者 |坛友微信交流群

一看就好高大上。。。。。

回复

使用道具举报

18楼

在职认证

发表于 2014-10-23 10:12:15 |只看作者 |坛友微信交流群

好书，感谢分享

回复

使用道具举报

19楼

gaojianwqjk 发表于 2014-10-23 12:25:04 |只看作者 |坛友微信交流群

回复

使用道具举报

20楼

richardchan001 发表于 2014-10-24 00:40:25 |只看作者 |坛友微信交流群

xie xie
xie xie
xie xie

回复

使用道具举报

上一页 123 下一页

发帖

本版微信群

加好友,备注jltj
拉您入交流群

如有投资本站、合作意向或投放广告，请联系：13661292478（刘老师）

联系客服

邮箱：service@pinggu.org 投诉或不良信息处理：（010-68466864）

京ICP备16021002-2号京B2-20170662号京公网安备 11010802022788号论坛法律顾问：王进律师知识产权保护声明免责及隐私声明