程序员人生 网站导航

Learn Web Crawling with Perl

栏目:php教程 时间:2015-03-11 08:03:31
#####
# Overview of Web.Crawling related modules.
# Note: the code below is an overview only; it is not meant to be executed
# as-is (placeholders are undefined and every call needs network access).
#####

#!/usr/bin/perl

use strict;
use warnings;

#####
# HTTP::Thin
#####
use 5.12.1;    # also enables the "say" feature used below
use HTTP::Request::Common;
use HTTP::Thin;

# GET comes from HTTP::Request::Common and builds the request object.
say HTTP::Thin->new()->request( GET 'http://example.com' )->as_string;

#####
# HTTP::Tiny
#####
use HTTP::Tiny;

my $response = HTTP::Tiny->new->get('http://example.com/');
die "Failed!\n" unless $response->{success};

print "$response->{status} $response->{reason}\n";

# A header value may be a plain scalar or an array reference; handle both.
while ( my ( $k, $v ) = each %{ $response->{headers} } ) {
    for ( ref($v) eq 'ARRAY' ? @$v : $v ) {
        print "$k: $_\n";
    }
}

print $response->{content} if length $response->{content};

# Placeholders for the API summary below; assign real values before running.
my ( $url, $method, $form_data, $data, $file_path );
my ( %attributes, %options );

# new
my $http = HTTP::Tiny->new(%attributes);

# valid attributes include:
#   -agent
#   -cookie_jar
#   -default_headers
#   -local_address
#   -keep_alive
#   -max_redirect
#   -max_size
#   -https_proxy
#   -proxy
#   -no_proxy
#   -timeout
#   -verify_SSL
#   -SSL_options

# get / head / put / post / delete
$response = $http->get($url);
$response = $http->get( $url, \%options );
$response = $http->head($url);

# post_form
$response = $http->post_form( $url, $form_data );
$response = $http->post_form( $url, $form_data, \%options );

# request
$response = $http->request( $method, $url );
$response = $http->request( $method, $url, \%options );

# Credentials embedded in the URL as user:password@host.
# NOTE(review): the URLs in the original text were garbled (a space and an
# e-mail address were injected); they are reconstructed here following the
# form shown in the HTTP::Tiny documentation -- confirm the intended values.
$http->request( 'GET', 'http://user:pwd@example.com/' );

# or, with an "@" inside the user name percent-encoded as %40:
$http->request( 'GET', 'http://mars%40example.com:pwd@example.com/' );

# www_form_urlencode
my $params = $http->www_form_urlencode($data);
$response = $http->get("http://example.com/query?$params");

# SSL support: SSL_options is a constructor attribute.
$http = HTTP::Tiny->new(
    SSL_options => {
        SSL_ca_file => $file_path,
    },
);

# proxy support

#####
# WWW::Mechanize
#
# Stateful programmatic web browsing, used for automating interaction with websites.
#####
use strict;
use warnings;

use WWW::Mechanize;
use Test::More;

# Overview only: every call below needs network access and real values for
# the placeholders. The bare block keeps the placeholder names in their own
# lexical scope.
{
    # Placeholders; assign real values before running.
    my ( $url, $expected, $number, $name, $value, $field_values, $button );
    my @criteria;

    my $mech = WWW::Mechanize->new();

    $mech->get($url);

    # Follow a link chosen by position, by link text, or by exact URL.
    $mech->follow_link( n          => 3 );
    $mech->follow_link( text_regex => qr/download this/i );
    $mech->follow_link( url        => 'http://host.com/index.html' );

    # Fill in and submit a form selected by its number on the page ...
    $mech->submit_form(
        form_number => 3,
        fields      => {
            username => 'banana',
            password => 'lost-and-alone',
        },
    );

    # ... or by its name, pressing a specific button.
    $mech->submit_form(
        form_name => 'search',
        fields    => { query => 'pot of gold', },
        button    => 'search now',
    );

    # testing web applications
    like( $mech->content(), qr/$expected/, "Got expected content" );
    done_testing();    # no plan was declared, so close the test run explicitly

    # page traverse
    $mech->back();

    # finer control over page
    $mech->find_link( n => $number );
    $mech->form_number($number);
    $mech->form_name($name);
    $mech->field( $name, $value );
    $mech->set_fields($field_values);
    $mech->set_visible(@criteria);
    $mech->click($button);

    # subclass of LWP::UserAgent, eg:
    $mech->add_header( $name => $value );

    # Method groups listed in the WWW::Mechanize documentation:
    #   page-fetching methods
    #   status methods
    #   content-handling methods
    #   link methods
    #   image methods
    #   form methods
    #   field methods
    #   miscellaneous methods
    #   overridden LWP::UserAgent methods
    #   inherited unchanged LWP::UserAgent methods
}

# yeah now, it's easy to implement a spider project for future integration use.
Mars
------分隔线----------------------------
------分隔线----------------------------

最新技术推荐