1,两个小时学会perl
http://qntm.org/files/perl/perl.html
2,Python整站爬虫(Demo)(依赖Mysql)
本帖隐藏的内容
- #!/usr/bin/python
- # vim: set fileencoding=utf-8:
- import sys
- import urllib2
- import re
- from BeautifulSoup import BeautifulSoup
- import ConfigParser
- import MySQLdb as mdb
class Db_Connector:
    """Thin MySQL helper.

    Reads connection settings from the [mysql_db] section of an INI file
    and wraps cursor operations with best-effort error reporting: every
    method prints a tag on failure and returns None instead of raising.
    """

    def __init__(self, config_file_path):
        # Expected keys: host, port, username, password, db_name.
        cf = ConfigParser.ConfigParser()
        cf.read(config_file_path)
        db_host = cf.get("mysql_db", "host")
        # NOTE(review): port is read but never passed to connect(), so the
        # driver default is always used — confirm whether that is intended.
        db_port = cf.getint("mysql_db", "port")
        db_user = cf.get("mysql_db", "username")
        db_pwd = cf.get("mysql_db", "password")
        db_data = cf.get("mysql_db", "db_name")
        try:
            self.con = mdb.connect(db_host, db_user, db_pwd, db_data)
            self.cur = self.con.cursor()
        except Exception:
            # Was a bare except: keep the best-effort behavior but stop
            # swallowing SystemExit/KeyboardInterrupt.
            print("[*] DB Connect Error")

    def find_all(self, sql_script):
        """Execute a query and return all rows, or None on failure."""
        try:
            self.cur.execute(sql_script)
            return self.cur.fetchall()
        except Exception:
            print("[*] DB FindAll Error")

    def find_item(self, sql_script):
        """Execute a query and return the first row, or None on failure."""
        try:
            self.cur.execute(sql_script)
            return self.cur.fetchone()
        except Exception:
            print("[*] DB FindItem Error")

    def insert_item(self, sql_script):
        """Execute an INSERT and commit; True on success, None on failure."""
        try:
            self.cur.execute(sql_script)
            self.con.commit()
            return True
        except Exception:
            print("[*] DB Insert Into Error")

    def update_item(self, sql_script):
        """Execute an UPDATE and commit; True on success, None on failure."""
        try:
            self.cur.execute(sql_script)
            self.con.commit()
            return True
        except Exception:
            print("[*] DB Update Error")
- class SpriderUrl:
- # 初始化
- def __init__(self,url):
- self.url=url
- self.con=Db_Connector('sprider.ini')
- #获得目标url的第一次url清单
- def get_self(self):
- urls=[]
- try:
- body_text=urllib2.urlopen(self.url).read()
- except:
- print "[*] Web Get Error:checking the Url"
- sys.exit(0)
- soup=BeautifulSoup(body_text)
- links=soup.findAll('a')
- for link in links:
- # 获得了目标的url但还需要处理
- _url=link.get('href')
- # 接着对其进行判断处理
- # 先判断它是否是无意义字符开头以及是否为None值
- # 判断URL后缀,不是列表的不抓取
- if re.match('^(javascript|:;|#)',_url) or _url is None or re.match('.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)
3,JAVA构造器的实际使用
主要问题是在复习PYTHON的面对对象,所以看看JAVA找灵感。
- package basic.day12;
- public class User {
- String name;
- String pwd;
- int age;
-
-
- public User(String name,String pwd,int age){
- this.name=name;
- this.pwd=pwd;
- this.age=age;
- }
- public String toString(){
- return name+":"+age;
- }
- }
3,JAVA构造器的实际使用
package basic.day12;
// Duplicate paste of the User example (same class appears earlier in the post).
public class User {
// login name; the only string field rendered by toString()
String name;
// password; stored on construction but never rendered
String pwd;
int age;
// All-args constructor: copies each argument into the matching field.
public User(String name,String pwd,int age){
this.name=name;
this.pwd=pwd;
this.age=age;
}
// Renders as "name:age" (pwd omitted).
public String toString(){
return name+":"+age;
}
}
4,Perl爬虫动手(抓首页)
下面的代码目前只实现了抓取首页,后面会改下,要求抓2级到3级页面
#!/usr/bin/perl
use HTML::LinkExtor;
use LWP::Simple;
# Fetch $base_url, extract every link-attribute value on the page, and
# return the sorted, de-duplicated URLs that start with $base_url.
sub sparit {
    my ($base_url) = @_;          # was local(): my() gives real lexical scope
    my %seen;                     # were implicit globals: state leaked
    my @same_site;                # between calls — now reset per call

    # Parse the fetched page; LinkExtor resolves links against $base_url.
    my $parser = HTML::LinkExtor->new(undef, $base_url);
    $parser->parse(get($base_url))->eof;

    # Each element of links() is [tag, attr1 => val1, attr2 => val2, ...];
    # collect every attribute value and count duplicates away via %seen.
    foreach my $linkarray ($parser->links) {
        my ($elt_type, @attrs) = @$linkarray;
        while (@attrs) {
            my ($attr_name, $attr_value) = splice(@attrs, 0, 2);
            $seen{$attr_value}++;
        }
    }

    # Keep only same-site URLs. \Q...\E escapes regex metacharacters in
    # $base_url (dots, slashes, '?', ...) that previously leaked into the
    # pattern unquoted.
    for (sort keys %seen) {
        push @same_site, "$_\n" if /^\Q$base_url\E/;
    }
    return @same_site;
}
5,Python字符串处理例子
#!/usr/bin/python
#coding=gb2312
line='aa,bb,cc,dd';
print line.split(',');
w='start';
s='wic\n';
print w.upper();
print w.isalpha();
print s.rstrip();
print '%s,caonima,de,%s' %('wo','b');
print '{0},caonima,de,{1}'.format('cao', 'nidaye');



雷达卡



学习学习
京公网安备 11010802022788号







