Original poster: Lisrelchen

Top 20 Python Machine Learning Open Source Projects

11
Lisrelchen posted on 2016-4-7 09:31:55
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.db  import date

# This example reads a given RSS or Atom newsfeed channel.
# Some example feeds to try out:
NATURE  = "http://feeds.nature.com/nature/rss/current"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
NYT     = "http://rss.nytimes.com/services/xml/rss/nyt/GlobalHome.xml"
TIME    = "http://feeds.feedburner.com/time/topstories"
CNN     = "http://rss.cnn.com/rss/edition.rss"

engine = Newsfeed()

for result in engine.search(CNN, cached=True):
    print result.title.upper()
    print plaintext(result.text) # Remove HTML formatting.
    print result.url
    print result.date
    print

# News item URL's lead to the page with the full article.
# This page can have any kind of formatting.
# There is no default way to read it.
# But we could just download the source HTML and convert it to plain text:

#html = URL(result.url).download()
#print plaintext(html)

# The resulting text may contain a lot of garbage.
# A better way is to use a DOM parser to select the HTML elements we want.
# This is demonstrated in one of the next examples.
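
A rough sketch of the DOM-based approach hinted at in the last comment. Here "result" is the last newsfeed item from the loop above, and the "p" selector is only an assumption; real article pages need site-specific selectors:

from pattern.web import URL, DOM, plaintext

html = URL(result.url).download(cached=True) # Download the full article page.
dom  = DOM(html)
# Keep only the text of <p> elements; the right selector depends on the site.
print "\n\n".join(plaintext(p.content) for p in dom.by_tag("p"))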

12
Lisrelchen posted on 2016-4-7 09:37:41
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# Wikipedia queries request the article HTML source from the server. This can be slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to the other search engines in the pattern.web module,
# Wikipedia simply returns one WikipediaArticle object (or None),
# instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

print article.title               # Article title (may differ from the search query).
print
print article.languages["fr"]     # Article in French, can be retrieved with Wikipedia(language="fr").
print article.links[:10], "..."   # List of linked Wikipedia articles.
print article.external[:5], "..." # List of external URL's.
print

#print article.source # The full article content as HTML.
#print article.string # The full article content, plain text with HTML tags stripped.

# An article is made up of different sections with a title.
# WikipediaArticle.sections is a list of WikipediaSection objects.
# Each section has a title + content and can have a linked parent section or child sections.
for s in article.sections:
    print s.title.upper()
    print
    print s.content # = ArticleSection.string, minus the title.
    print
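
A small follow-up sketch, assuming the article lists a French counterpart in article.languages (printed above); it simply repeats the search against the French Wikipedia:

fr = Wikipedia(language="fr")
article_fr = fr.search(article.languages["fr"], cached=True, timeout=30)
print article_fr.title
print article_fr.links[:10], "..."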

13
Lisrelchen posted on 2016-4-7 09:38:46
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wiktionary, DOM
from pattern.db import csv, pd

# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
# The classifier is small (80KB) and fast.

w = Wiktionary(language="en")
f = csv() # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in "abcdefghijklmnopqrstuvwxyz":
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save(pd("given-names.csv"))
        print ch, gender

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv

class GenderByName(SVM):

    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)

    def classify(self, name):
        return SVM.classify(self, self.vector(name))

    def vector(self, name):
        """ Returns a dictionary with character bigrams and suffix.
            For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
        """
        v = chngrams(name, n=2)
        v = count(v)
        v[name[-2:]+"$"] = 1
        v[len(name)] = 1
        return v

data = csv(pd("given-names.csv"))

# Test average (accuracy, precision, recall, F-score, standard deviation).

print kfoldcv(GenderByName, data, folds=3) # (0.81, 0.79, 0.77, 0.78, 0.00)

# Train and save the classifier in the current folder.
# With final=True, discards the original training data (= smaller file).

g = GenderByName(train=data)
g.save(pd("gender-by-name.svm"), final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.

g = GenderByName.load(pd("gender-by-name.svm"))

for name in (
  "Felix",
  "Felicia",
  "Rover",
  "Kitty",
  "Legolas",
  "Arwen",
  "Jabba",
  "Leia",
  "Flash",
  "Barbarella"):
    print name, g.classify(name)

# In the example above, Arwen and Jabba are misclassified.
# We can of course improve the classifier by hand:

#g.train("Arwen", gender="f")
#g.train("Jabba", gender="m")
#g.save(pd("gender-by-name.svm"), final=True)
#print g.classify("Arwen")
#print g.classify("Jabba")
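
For a quick check of the feature extraction, the classifier's vector() method can be inspected directly on the loaded classifier; this sketch just mirrors the docstring above (dictionary ordering may differ):

print g.vector("Felix") # e.g. {"Fe": 1, "el": 1, "li": 1, "ix": 1, "ix$": 1, 5: 1}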

14
hanszhu posted on 2016-4-7 10:12:15
# -*- coding: utf-8 -*-
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikia

# This example retrieves articles from Wikia (http://www.wikia.com).
# Wikia is a collection of thousands of wikis based on MediaWiki.
# Wikipedia is based on MediaWiki too.
# Wikia queries request the article HTML source from the server. This can be slow.

domain = "monkeyisland" # "Look behind you, a three-headed monkey!"

# Alternatively, you can call this script from the commandline
# and specify another domain: python 09-wikia.py "Bieberpedia".
if len(sys.argv) > 1:
    domain = sys.argv[1]

w = Wikia(domain, language="en")

# Like Wikipedia, we can search for articles by title with Wikia.search():
print w.search("Three Headed Monkey")

# However, we may not know exactly what kind of articles exist;
# "three-headed monkey", for example, does not redirect to the above article.

# We can iterate through all articles with the Wikia.articles() method
# (note that Wikipedia also has a Wikipedia.articles() method).
# The "count" parameter sets the number of article titles to retrieve per query.
# Retrieving the full article for each article takes another query. This can be slow.
i = 0
for article in w.articles(count=2, cached=True):
    print
    print article.title
    #print article.plaintext()
    i += 1
    if i >= 3:
        break

# Alternatively, we can retrieve just the titles,
# and only retrieve the full articles for the titles we need:
i = 0
for title in w.index(count=2):
    print
    print title
    #article = w.search(title)
    #print article.plaintext()
    i += 1
    if i >= 3:
        break
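
A small defensive sketch for titles that may not resolve to an article; this assumes Wikia.search() returns None when nothing is found, like Wikipedia.search() in the earlier example:

article = w.search("Three Headed Monkey")
if article is not None:
    print article.title
    print article.plaintext()[:500] # First 500 characters of the plain text.
else:
    print "No article found."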

15
hanszhu posted on 2016-4-7 10:13:13
# -*- coding: utf-8 -*-
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import DBPedia

dbp = DBPedia()

# DBPedia is a database of structured information mined from Wikipedia.
# DBPedia data is stored as RDF triples: (subject, predicate, object),
# e.g., X is-a Actor, Y is-a Country, Z has-birthplace Country, ...
# If you know about pattern.graph (or graphs in general),
# this triple format should look familiar.

# DBPedia can be queried using SPARQL:
# http://dbpedia.org/sparql
# http://www.w3.org/TR/rdf-sparql-query/
# A SPARQL query yields rows that match all triples in the WHERE clause.
# A SPARQL query uses ?wildcards in triple subject/object to select fields.

# 1) Search DBPedia for actors.

# Variables are indicated with a "?" prefix.
# Variables will be bound to the corresponding part of each matched triple.
# The "a" is short for "is of the class".
# The "prefix" statement creates a shorthand for a given namespace.
# To see what semantic constraints are available in "dbo" (for example):
# http://dbpedia.org/ontology/
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor where {
    ?actor a dbo:Actor.
}
"""
for result in dbp.search(q, start=1, count=10):
    print result.actor
print

# You may notice that each Result.actor is of the form:
# "http://dbpedia.org/resource/[NAME]"
# This kind of string is a subclass of unicode: DBPediaResource.
# DBPediaResource has a DBPediaResource.name property (see below).

# 2) Search DBPedia for actors and their place of birth.

q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?place where {
    ?actor a dbo:Actor.
    ?actor dbo:birthPlace ?place.
}
order by ?actor
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor.name, r.place.name)
print

# You will notice that the results now include duplicates:
# the same actor with a city name, and with a country name.
# We could refine ?place by including the following triple:
# "?place a dbo:Country."

# 3) Search DBPedia for actors born in 1970.

# Each result must match both triples, i.e.,
# X is an actor + X is born on Y.
# We don't want to filter by month and day (e.g., "1970-12-31"),
# so we use a regular expression instead with filter():
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?date where {
    ?actor a dbo:Actor.
    ?actor dbo:birthDate ?date.
    filter(regex(str(?date), "1970-..-.."))
}
order by ?date
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor.name, r.date)
print

# We could also make this query shorter,
# by combining the two ?actor triples into one:
# "?actor a dbo:Actor; dbo:birthDate ?date."

# 4) A more advanced example, in German:

q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?actor ?place where {
    ?_actor a dbo:Actor.
    ?_actor dbo:birthPlace ?_place.
    ?_actor rdfs:label ?actor.
    ?_place rdfs:label ?place.
    filter(lang(?actor) = "de" && lang(?place) = "de")
}
order by ?actor
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor, r.place)
print

# This extracts a German label for each matched DBPedia resource.
# - X is an actor,
# - X is born in Y,
# - X has a German label A,
# - Y has a German label B,
# - Retrieve A and B.

# For example, say one of the matched resources was:
# "<http://dbpedia.org/page/Erwin_Schrödinger>"
# If you open this URL in a browser,
# you will see all the available semantic properties and their values.
# One of the properties is "rdfs:label": a human-readable & multilingual label.

# 5) Find triples involving cats.

# <http://purl.org/dc/terms/subject>
# means: "is in the category of".
q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?cat ?relation ?concept where {
    ?cat <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Felis>.
    ?cat ?_relation ?_concept.
    ?_relation rdfs:label ?relation.
    ?_concept rdfs:label ?concept.
    filter(lang(?relation) = "en" && lang(?concept) = "en")
} order by ?cat
"""
for r in dbp.search(q, start=1, count=10):
    print "%s ---%s--> %s" % (r.cat.name, r.relation.ljust(10, "-"), r.concept)
print

# 6) People whose first name includes "édouard".

q = u"""
prefix dbo: <http://dbpedia.org/ontology/>
prefix foaf: <http://xmlns.com/foaf/0.1/>
select ?person ?name where {
    ?person a dbo:Person.
    ?person foaf:givenName ?name.
    filter(regex(?name, "édouard"))
}
"""
for result in dbp.search(q, start=1, count=10, cached=False):
    print "%s (%s)" % (result.person.name, result.name)
print
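
As a sketch of the two refinements suggested in the comments above (restricting ?place to countries with "?place a dbo:Country." and merging the two ?actor triples with a semicolon):

q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?place where {
    ?actor a dbo:Actor; dbo:birthPlace ?place.
    ?place a dbo:Country.
}
order by ?actor
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor.name, r.place.name)
print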

16
hanszhu posted on 2016-4-7 10:14:13
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Facebook, NEWS, COMMENTS, LIKES
from pattern.db  import Datasheet, pprint, pd

# The Facebook API can be used to search public status updates (no license needed).

# It can also be used to get status updates, comments and persons that liked it,
# from a given profile or product page.
# This requires a personal license key.
# If you are logged in to Facebook, you can get a license key here:
# http://www.clips.ua.ac.be/pattern-facebook
# (We don't / can't store your information).

# 1) Searching for public status updates.
#    Search for all status updates that contain the word "horrible".

try:
    # We'll store the status updates in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each status update.
    # We only want to add new status updates, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    table = Datasheet.load(pd("opinions.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

fb = Facebook()

# With Facebook.search(cached=False), a "live" request is sent to Facebook:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print "=" * 100
    print status.id
    print status.text
    print status.author # Yields an (id, name)-tuple.
    print status.date
    print status.likes
    print status.comments
    print
    # Only add the status update to the table if it doesn't already exist.
    if len(table) == 0 or status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

# Create a .csv in pattern/examples/01-web/
table.save(pd("opinions.csv"))

# 2) Status updates from specific profiles.
#    For this you need a personal license key:
#    http://www.clips.ua.ac.be/pattern-facebook

license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns a dictionary with author info.
    # By default, this is your own profile.
    # You can also supply the id of another profile,
    # or the name of a product page.
    me = fb.profile()["id"]
    for status in fb.search(me, type=NEWS, count=30, cached=False):
        print "-" * 100
        print status.id    # Status update unique id.
        print status.title # Status title (i.e., the id of the page or event given as URL).
        print status.text  # Status update text.
        print status.url   # Status update image, external link, ...
        if status.comments > 0:
            # Retrieve comments on the status update.
            print "%s comments:" % status.comments
            print [(x.author, x.text, x.likes) for x in fb.search(status.id, type=COMMENTS)]
        if status.likes > 0:
            # Retrieve likes on the status update.
            print "%s likes:" % status.likes
            print [x.author for x in fb.search(status.id, type=LIKES)]
        print
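
A minimal sketch of re-reading the collected opinions in a later session; it assumes opinions.csv was created by the run above:

table = Datasheet.load(pd("opinions.csv"))
print "%s status updates collected." % len(table)
for id, text in table[:5]:
    print id, text[:80] # First 80 characters of each status update.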

17
hanszhu posted on 2016-4-7 10:15:57
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import URL, DOM, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The pattern.web module has a number of convenient search engines, as demonstrated.
# But often you will need to handle the HTML in web pages of your interest manually.
# The DOM object can be used for this, similar to the Javascript DOM.
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="entry">
#     <p class="title">
#         <a class="title " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print dom.body.content
for e in dom.by_tag("div.entry")[:5]: # Top 5 reddit entries.
    for a in e.by_tag("a.title")[:1]: # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attrs["href"]
        print

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attrs.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    #print link

# The DOM object is a tree of nested Element and Text objects.
# All objects inherit from Node (check the source code).

# Node.type       : NODE, TEXT, COMMENT, ELEMENT or DOM
# Node.parent     : Parent Node object.
# Node.children   : List of child Node objects.
# Node.next       : Next Node in Node.parent.children.
# Node.previous   : Previous Node in Node.parent.children.

# DOM.head        : Element with tag name "head".
# DOM.body        : Element with tag name "body".

# Element.tag     : Element tag name, e.g. "body".
# Element.attrs   : Dictionary of tag attributes, e.g. {"class": "header"}
# Element.content : Element HTML content as a string.
# Element.source  : Element tag + content.

# Element.get_element_by_id(value)
# Element.get_elements_by_tagname(value)
# Element.get_elements_by_classname(value)
# Element.get_elements_by_attribute(name=value)

# You can also use shorter aliases (we prefer them):
# Element.by_id(), by_tag(), by_class(), by_attr().

# The tag name passed to Element.by_tag() can include
# a class (e.g., "div.message") or an id (e.g., "div#header").

# For example:
# In the <head> tag, retrieve the <meta name="keywords"> element.
# Get the string value of its "content" attribute and split into a list:
dom = DOM(URL("http://www.clips.ua.ac.be").download())
kw = dom.head.by_attr(name="keywords")[0]
kw = kw.attrs["content"]
kw = [x.strip() for x in kw.split(",")]
print kw
print

# If you know CSS, you can also use short and handy CSS selectors:
# http://www.w3.org/TR/CSS2/selector.html
# Element(selector) will return a list of nested elements that match the given string.
dom = DOM(URL("http://www.clips.ua.ac.be").download())
for e in dom("div#sidebar-left li div:first-child span"):
    print plaintext(e.content)
    print
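
A self-contained sketch of the selector aliases on a small, hypothetical HTML string (no network access needed; the tag names and attributes are made up for illustration):

html = '<div id="header" class="title"><a href="#intro" title="x">Hello</a></div>'
dom = DOM(html)
print dom.by_id("header").tag          # "div"
print dom.by_class("title")[0].content # Inner HTML of the <div>.
print dom.by_tag("a")[0].attrs["href"] # "#intro"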

18
hanszhu posted on 2016-4-7 10:17:50

19
Nicolle (verified student) posted on 2016-4-7 10:37:25
Notice: this author has been banned or deleted; the content is automatically hidden.

20
Nicolle (verified student) posted on 2016-4-7 10:39:15
Notice: this author has been banned or deleted; the content is automatically hidden.
