- 阅读权限
- 255
- 威望
- 0 级
- 论坛币
- 50288 个
- 通用积分
- 83.6306
- 学术水平
- 253 点
- 热心指数
- 300 点
- 信用等级
- 208 点
- 经验
- 41518 点
- 帖子
- 3256
- 精华
- 14
- 在线时间
- 766 小时
- 注册时间
- 2006-5-4
- 最后登录
- 2022-11-6
|
- import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
- from pattern.web import Wiktionary, DOM
- from pattern.db import csv, pd
- # This example retrieves male and female given names from Wiktionary (http://en.wiktionary.org).
- # It then trains a classifier that can predict the gender of unknown names (about 78% correct).
- # The classifier is small (80KB) and fast.
- w = Wiktionary(language="en")
- f = csv() # csv() is a short alias for Datasheet().
- # Collect male and female given names from Wiktionary.
- # Store the data as (name, gender)-rows in a CSV-file.
- # The pd() function returns the parent directory of the current script,
- # so pd("given-names.csv") = pattern/examples/01-web/given-names.csv.
- for gender in ("male", "female"):
- for ch in ("abcdefghijklmnopqrstuvwxyz"):
- p = w.search("Appendix:%s_given_names/%s" % (gender.capitalize(), ch.capitalize()), cached=True)
- for name in p.links:
- if not name.startswith("Appendix:"):
- f.append((name, gender[0]))
- f.save(pd("given-names.csv"))
- print ch, gender
- # Create a classifier that predicts gender based on name.
- from pattern.vector import SVM, chngrams, count, kfoldcv
- class GenderByName(SVM):
- def train(self, name, gender=None):
- SVM.train(self, self.vector(name), gender)
- def classify(self, name):
- return SVM.classify(self, self.vector(name))
- def vector(self, name):
- """ Returns a dictionary with character bigrams and suffix.
- For example, "Felix" => {"Fe":1, "el":1, "li":1, "ix":1, "ix$":1, 5:1}
- """
- v = chngrams(name, n=2)
- v = count(v)
- v[name[-2:]+"$"] = 1
- v[len(name)] = 1
- return v
- data = csv(pd("given-names.csv"))
- # Test average (accuracy, precision, recall, F-score, standard deviation).
- print kfoldcv(GenderByName, data, folds=3) # (0.81, 0.79, 0.77, 0.78, 0.00)
- # Train and save the classifier in the current folder.
- # With final=True, discards the original training data (= smaller file).
- g = GenderByName(train=data)
- g.save(pd("gender-by-name.svm"), final=True)
- # Next time, we can simply load the trained classifier.
- # Keep in mind that the script that loads the classifier
- # must include the code for the GenderByName class description,
- # otherwise Python won't know how to load the data.
- g = GenderByName.load(pd("gender-by-name.svm"))
- for name in (
- "Felix",
- "Felicia",
- "Rover",
- "Kitty",
- "Legolas",
- "Arwen",
- "Jabba",
- "Leia",
- "Flash",
- "Barbarella"):
- print name, g.classify(name)
- # In the example above, Arwen and Jabba are misclassified.
- # We can of course improve the classifier by hand:
- #g.train("Arwen", gender="f")
- #g.train("Jabba", gender="m")
- #g.save(pd("gender-by-name.svm"), final=True)
- #print g.classify("Arwen")
- #print g.classify("Jabba")
复制代码
|
|