Original poster: Lisrelchen

Top 20 Python Machine Learning Open Source Projects

11
Lisrelchen posted on 2016-4-7 09:31:55
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Newsfeed, plaintext, URL
from pattern.db  import date

# This example reads a given RSS or Atom newsfeed channel.
# Some example feeds to try out:
NATURE  = "http://feeds.nature.com/nature/rss/current"
SCIENCE = "http://www.sciencemag.org/rss/podcast.xml"
NYT     = "http://rss.nytimes.com/services/xml/rss/nyt/GlobalHome.xml"
TIME    = "http://feeds.feedburner.com/time/topstories"
CNN     = "http://rss.cnn.com/rss/edition.rss"

engine = Newsfeed()

for result in engine.search(CNN, cached=True):
    print result.title.upper()
    print plaintext(result.text) # Remove HTML formatting.
    print result.url
    print result.date
    print

# News item URL's lead to the page with the full article.
# This page can have any kind of formatting.
# There is no default way to read it.
# But we could just download the source HTML and convert it to plain text:

#html = URL(result.url).download()
#print plaintext(html)

# The resulting text may contain a lot of garbage.
# A better way is to use a DOM parser to select the HTML elements we want.
# This is demonstrated in one of the next examples.
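
A rough sketch of the DOM-based approach hinted at in the last comment. Here "result" is the last newsfeed item from the loop above, and the "p" selector is only an assumption; real article pages need site-specific selectors:

from pattern.web import URL, DOM, plaintext

html = URL(result.url).download(cached=True) # Download the full article page.
dom  = DOM(html)
# Keep only the text of <p> elements; the right selector depends on the site.
print "\n\n".join(plaintext(p.content) for p in dom.by_tag("p"))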

12
Lisrelchen posted on 2016-4-7 09:37:41
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# Wikipedia queries request the article HTML source from the server. This can be slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to the other search engines in the pattern.web module,
# Wikipedia simply returns one WikipediaArticle object (or None),
# instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

print article.title               # Article title (may differ from the search query).
print
print article.languages["fr"]     # Article in French, can be retrieved with Wikipedia(language="fr").
print article.links[:10], "..."   # List of linked Wikipedia articles.
print article.external[:5], "..." # List of external URL's.
print

#print article.source # The full article content as HTML.
#print article.string # The full article content, plain text with HTML tags stripped.

# An article is made up of different sections with a title.
# WikipediaArticle.sections is a list of WikipediaSection objects.
# Each section has a title + content and can have a linked parent section or child sections.
for s in article.sections:
    print s.title.upper()
    print
    print s.content # = ArticleSection.string, minus the title.
    print
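
A small follow-up sketch, assuming the article lists a French counterpart in article.languages (printed above); it simply repeats the search against the French Wikipedia:

fr = Wikipedia(language="fr")
article_fr = fr.search(article.languages["fr"], cached=True, timeout=30)
print article_fr.title
print article_fr.links[:10], "..."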

13
Lisrelchen posted on 2016-4-7 09:38:46
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wiktionary, DOM
from pattern.db import csv, pd

# This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
# It then trains a classifier that can predict the gender of unknown names (about 78% correct).
# The classifier is small (80KB) and fast.

w = Wiktionary(language="en")
f = csv() # csv() is a short alias for Datasheet().

# Collect male and female given names from Wiktionary.
# Store the data as (name, gender)-rows in a CSV-file.
# The pd() function returns the parent directory of the current script,
# so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.

for gender in ("male", "female"):
    for ch in "abcdefghijklmnopqrstuvwxyz":
        p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
        for name in p.links:
            if not name.startswith("Appendix:"):
                f.append((name, gender[0]))
        f.save(pd("given-names.csv"))
        print ch, gender

# Create a classifier that predicts gender based on name.

from pattern.vector import SVM, chngrams, count, kfoldcv

class GenderByName(SVM):

    def train(self, name, gender=None):
        SVM.train(self, self.vector(name), gender)

    def classify(self, name):
        return SVM.classify(self, self.vector(name))

    def vector(self, name):
        """ Returns a dictionary with character bigrams and suffix.
            For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
        """
        v = chngrams(name, n=2)
        v = count(v)
        v[name[-2:]+"$"] = 1
        v[len(name)] = 1
        return v

data = csv(pd("given-names.csv"))

# Test average (accuracy, precision, recall, F-score, standard deviation).

print kfoldcv(GenderByName, data, folds=3) # (0.81, 0.79, 0.77, 0.78, 0.00)

# Train and save the classifier in the current folder.
# With final=True, discards the original training data (= smaller file).

g = GenderByName(train=data)
g.save(pd("gender-by-name.svm"), final=True)

# Next time, we can simply load the trained classifier.
# Keep in mind that the script that loads the classifier
# must include the code for the GenderByName class description,
# otherwise Python won't know how to load the data.

g = GenderByName.load(pd("gender-by-name.svm"))

for name in (
  "Felix",
  "Felicia",
  "Rover",
  "Kitty",
  "Legolas",
  "Arwen",
  "Jabba",
  "Leia",
  "Flash",
  "Barbarella"):
    print name, g.classify(name)

# In the example above, Arwen and Jabba are misclassified.
# We can of course improve the classifier by hand:

#g.train("Arwen", gender="f")
#g.train("Jabba", gender="m")
#g.save(pd("gender-by-name.svm"), final=True)
#print g.classify("Arwen")
#print g.classify("Jabba")
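
For a quick check of the feature extraction, the classifier's vector() method can be inspected directly on the loaded classifier; this sketch just mirrors the docstring above (dictionary ordering may differ):

print g.vector("Felix") # e.g. {"Fe": 1, "el": 1, "li": 1, "ix": 1, "ix$": 1, 5: 1}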

14
hanszhu posted on 2016-4-7 10:12:15
# -*- coding: utf-8 -*-
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikia

# This example retrieves articles from Wikia (http://www.wikia.com).
# Wikia is a collection of thousands of wikis based on MediaWiki.
# Wikipedia is based on MediaWiki too.
# Wikia queries request the article HTML source from the server. This can be slow.

domain = "monkeyisland" # "Look behind you, a three-headed monkey!"

# Alternatively, you can call this script from the commandline
# and specify another domain: python 09-wikia.py "Bieberpedia".
if len(sys.argv) > 1:
    domain = sys.argv[1]

w = Wikia(domain, language="en")

# Like Wikipedia, we can search for articles by title with Wikia.search():
print w.search("Three Headed Monkey")

# However, we may not know exactly what kind of articles exist;
# "three-headed monkey", for example, does not redirect to the above article.

# We can iterate through all articles with the Wikia.articles() method
# (note that Wikipedia also has a Wikipedia.articles() method).
# The "count" parameter sets the number of article titles to retrieve per query.
# Retrieving the full article for each article takes another query. This can be slow.
i = 0
for article in w.articles(count=2, cached=True):
    print
    print article.title
    #print article.plaintext()
    i += 1
    if i >= 3:
        break

# Alternatively, we can retrieve just the titles,
# and only retrieve the full articles for the titles we need:
i = 0
for title in w.index(count=2):
    print
    print title
    #article = w.search(title)
    #print article.plaintext()
    i += 1
    if i >= 3:
        break
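
A small defensive sketch for titles that may not resolve to an article; this assumes Wikia.search() returns None when nothing is found, like Wikipedia.search() in the earlier example:

article = w.search("Three Headed Monkey")
if article is not None:
    print article.title
    print article.plaintext()[:500] # First 500 characters of the plain text.
else:
    print "No article found."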

15
hanszhu posted on 2016-4-7 10:13:13
# -*- coding: utf-8 -*-
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import DBPedia

dbp = DBPedia()

# DBPedia is a database of structured information mined from Wikipedia.
# DBPedia data is stored as RDF triples: (subject, predicate, object),
# e.g., X is-a Actor, Y is-a Country, Z has-birthplace Country, ...
# If you know about pattern.graph (or graphs in general),
# this triple format should look familiar.

# DBPedia can be queried using SPARQL:
# http://dbpedia.org/sparql
# http://www.w3.org/TR/rdf-sparql-query/
# A SPARQL query yields rows that match all triples in the WHERE clause.
# A SPARQL query uses ?wildcards in triple subject/object to select fields.

# 1) Search DBPedia for actors.

# Variables are indicated with a "?" prefix.
# Variables will be bound to the corresponding part of each matched triple.
# The "a" is short for "is of the class".
# The "prefix" statement creates a shorthand for a given namespace.
# To see what semantic constraints are available in "dbo" (for example):
# http://dbpedia.org/ontology/
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor where {
    ?actor a dbo:Actor.
}
"""
for result in dbp.search(q, start=1, count=10):
    print result.actor
print

# You may notice that each Result.actor is of the form:
# "http://dbpedia.org/resource/[NAME]"
# This kind of string is a subclass of unicode: DBPediaResource.
# DBPediaResource has a DBPediaResource.name property (see below).

# 2) Search DBPedia for actors and their place of birth.

q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?place where {
    ?actor a dbo:Actor.
    ?actor dbo:birthPlace ?place.
}
order by ?actor
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor.name, r.place.name)
print

# You will notice that the results now include duplicates:
# the same actor with a city name, and with a country name.
# We could refine ?place by including the following triple:
# "?place a dbo:Country."

# 3) Search DBPedia for actors born in 1970.

# Each result must match both triples, i.e.,
# X is an actor + X is born on Y.
# We don't want to filter by month and day (e.g., "1970-12-31"),
# so we use a regular expression instead with filter():
q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?date where {
    ?actor a dbo:Actor.
    ?actor dbo:birthDate ?date.
    filter(regex(str(?date), "1970-..-.."))
}
order by ?date
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor.name, r.date)
print

# We could also make this query shorter,
# by combining the two ?actor triples into one:
# "?actor a dbo:Actor; dbo:birthDate ?date."

# 4) A more advanced example, in German:

q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?actor ?place where {
    ?_actor a dbo:Actor.
    ?_actor dbo:birthPlace ?_place.
    ?_actor rdfs:label ?actor.
    ?_place rdfs:label ?place.
    filter(lang(?actor) = "de" && lang(?place) = "de")
}
order by ?actor
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor, r.place)
print

# This extracts a German label for each matched DBPedia resource.
# - X is an actor,
# - X is born in Y,
# - X has a German label A,
# - Y has a German label B,
# - Retrieve A and B.

# For example, say one of the matched resources was:
# "<http://dbpedia.org/page/Erwin_Schrödinger>"
# If you open this URL in a browser,
# you will see all the available semantic properties and their values.
# One of the properties is "rdfs:label": a human-readable & multilingual label.

# 5) Find triples involving cats.

# <http://purl.org/dc/terms/subject>
# means: "is in the category of".
q = """
prefix dbo: <http://dbpedia.org/ontology/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
select ?cat ?relation ?concept where {
    ?cat <http://purl.org/dc/terms/subject> <http://dbpedia.org/resource/Category:Felis>.
    ?cat ?_relation ?_concept.
    ?_relation rdfs:label ?relation.
    ?_concept rdfs:label ?concept.
    filter(lang(?relation) = "en" && lang(?concept) = "en")
} order by ?cat
"""
for r in dbp.search(q, start=1, count=10):
    print "%s ---%s--> %s" % (r.cat.name, r.relation.ljust(10, "-"), r.concept)
print

# 6) People whose first name includes "édouard".

q = u"""
prefix dbo: <http://dbpedia.org/ontology/>
prefix foaf: <http://xmlns.com/foaf/0.1/>
select ?person ?name where {
    ?person a dbo:Person.
    ?person foaf:givenName ?name.
    filter(regex(?name, "édouard"))
}
"""
for result in dbp.search(q, start=1, count=10, cached=False):
    print "%s (%s)" % (result.person.name, result.name)
print
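
As a sketch of the two refinements suggested in the comments above (restricting ?place to countries with "?place a dbo:Country." and merging the two ?actor triples with a semicolon):

q = """
prefix dbo: <http://dbpedia.org/ontology/>
select ?actor ?place where {
    ?actor a dbo:Actor; dbo:birthPlace ?place.
    ?place a dbo:Country.
}
order by ?actor
"""
for r in dbp.search(q, start=1, count=10):
    print "%s (%s)" % (r.actor.name, r.place.name)
print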

16
hanszhu posted on 2016-4-7 10:14:13
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Facebook, NEWS, COMMENTS, LIKES
from pattern.db  import Datasheet, pprint, pd

# The Facebook API can be used to search public status updates (no license needed).

# It can also be used to get status updates, comments and persons that liked it,
# from a given profile or product page.
# This requires a personal license key.
# If you are logged in to Facebook, you can get a license key here:
# http://www.clips.ua.ac.be/pattern-facebook
# (We don't / can't store your information).

# 1) Searching for public status updates.
#    Search for all status updates that contain the word "horrible".

try:
    # We'll store the status updates in a Datasheet.
    # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
    # In the first column, we'll store a unique id for each status update.
    # We only want to add new status updates, i.e., those we haven't seen yet.
    # With an index on the first column we can quickly check if an id already exists.
    table = Datasheet.load(pd("opinions.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

fb = Facebook()

# With Facebook.search(cached=False), a "live" request is sent to Facebook:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print "=" * 100
    print status.id
    print status.text
    print status.author # Yields an (id, name)-tuple.
    print status.date
    print status.likes
    print status.comments
    print
    # Only add the status update to the table if it doesn't already exist.
    if len(table) == 0 or status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

# Create a .csv in pattern/examples/01-web/
table.save(pd("opinions.csv"))

# 2) Status updates from specific profiles.
#    For this you need a personal license key:
#    http://www.clips.ua.ac.be/pattern-facebook

license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns a dictionary with author info.
    # By default, this is your own profile.
    # You can also supply the id of another profile,
    # or the name of a product page.
    me = fb.profile()["id"]
    for status in fb.search(me, type=NEWS, count=30, cached=False):
        print "-" * 100
        print status.id    # Status update unique id.
        print status.title # Status title (i.e., the id of the page or event given as URL).
        print status.text  # Status update text.
        print status.url   # Status update image, external link, ...
        if status.comments > 0:
            # Retrieve comments on the status update.
            print "%s comments:" % status.comments
            print [(x.author, x.text, x.likes) for x in fb.search(status.id, type=COMMENTS)]
        if status.likes > 0:
            # Retrieve likes on the status update.
            print "%s likes:" % status.likes
            print [x.author for x in fb.search(status.id, type=LIKES)]
        print
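
A minimal sketch of re-reading the collected opinions in a later session; it assumes opinions.csv was created by the run above:

table = Datasheet.load(pd("opinions.csv"))
print "%s status updates collected." % len(table)
for id, text in table[:5]:
    print id, text[:80] # First 80 characters of each status update.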

17
hanszhu posted on 2016-4-7 10:15:57
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import URL, DOM, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The pattern.web module has a number of convenient search engines, as demonstrated.
# But often you will need to handle the HTML in web pages of your interest manually.
# The DOM object can be used for this, similar to the Javascript DOM.
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="entry">
#     <p class="title">
#         <a class="title " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print dom.body.content
for e in dom.by_tag("div.entry")[:5]: # Top 5 reddit entries.
    for a in e.by_tag("a.title")[:1]: # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attrs["href"]
        print

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attrs.get("href", "")
    link = abs(link, base=url.redirect or url.string)
    #print link

# The DOM object is a tree of nested Element and Text objects.
# All objects inherit from Node (check the source code).

# Node.type       : NODE, TEXT, COMMENT, ELEMENT or DOM
# Node.parent     : Parent Node object.
# Node.children   : List of child Node objects.
# Node.next       : Next Node in Node.parent.children.
# Node.previous   : Previous Node in Node.parent.children.

# DOM.head        : Element with tag name "head".
# DOM.body        : Element with tag name "body".

# Element.tag     : Element tag name, e.g. "body".
# Element.attrs   : Dictionary of tag attributes, e.g. {"class": "header"}
# Element.content : Element HTML content as a string.
# Element.source  : Element tag + content.

# Element.get_element_by_id(value)
# Element.get_elements_by_tagname(value)
# Element.get_elements_by_classname(value)
# Element.get_elements_by_attribute(name=value)

# You can also use shorter aliases (we prefer them):
# Element.by_id(), by_tag(), by_class(), by_attr().

# The tag name passed to Element.by_tag() can include
# a class (e.g., "div.message") or an id (e.g., "div#header").

# For example:
# In the <head> tag, retrieve the <meta name="keywords"> element.
# Get the string value of its "content" attribute and split into a list:
dom = DOM(URL("http://www.clips.ua.ac.be").download())
kw = dom.head.by_attr(name="keywords")[0]
kw = kw.attrs["content"]
kw = [x.strip() for x in kw.split(",")]
print kw
print

# If you know CSS, you can also use short and handy CSS selectors:
# http://www.w3.org/TR/CSS2/selector.html
# Element(selector) will return a list of nested elements that match the given string.
dom = DOM(URL("http://www.clips.ua.ac.be").download())
for e in dom("div#sidebar-left li div:first-child span"):
    print plaintext(e.content)
    print
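
A self-contained sketch of the selector aliases on a small, hypothetical HTML string (no network access needed; the tag names and attributes are made up for illustration):

html = '<div id="header" class="title"><a href="#intro" title="x">Hello</a></div>'
dom = DOM(html)
print dom.by_id("header").tag          # "div"
print dom.by_class("title")[0].content # Inner HTML of the <div>.
print dom.by_tag("a")[0].attrs["href"] # "#intro"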

18
hanszhu posted on 2016-4-7 10:17:50

19
Nicolle (verified student) posted on 2016-4-7 10:37:25
Notice: this author has been banned or deleted; the content is automatically hidden.

20
Nicolle (verified student) posted on 2016-4-7 10:39:15
Notice: this author has been banned or deleted; the content is automatically hidden.
