aoaopython-爬虫网页上邮箱,wiki下
aoaopython-爬虫网页上邮箱
import requests
import re
from bs4 import BeautifulSoup
import chardet
url = 'http://week.wiki/posts/f1fb5980d43c/'
response = requests.get(url)
# 自动识别编码并解码网页内容为Unicode字符串
encoding = chardet.detect(response.content)['encoding']
html = response.content.decode(encoding)
# 使用BeautifulSoup解析网页内容
soup = BeautifulSoup(html, 'html.parser')
# 正则表达式匹配电子邮件地址
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
emails = set() # 使用集合去重
for tag in soup.find_all('a', href=True):
if 'mailto:' in tag['href']:
email = re.search(email_regex, tag['href'])
if email:
emails.add(email.group(0))
# 输出所有找到的电子邮件地址
for email in emails:
print(email)
脚本2
import sys
import urllib
to_search_list = []
searched_list = []
email_list = []
email_list_withurl = []
def download(max_iterations):
iteration = 1
while iteration <= int(max_iterations):
print
"Searching for emails..."
## We reached a dead end if there are no
## more sites in our to_search_list
if len(to_search_list) == 0:
print
"Dead URL end"
break
## Get the first URL from the list of the URLs
## we need to crawl over, and place it in the list
## of URLs that we already crawled.
first_url = to_search_list[0]
to_search_list.remove(first_url)
searched_list.append(first_url)
## A simple function using urllib to download
## a URL
def download_url(url):
return urllib.urlopen(url)
## Try to download the URL. In case of an error,
## forget about it and move on to the next URL.
try:
content = download_url(first_url)
except:
try:
content = download_url(first_url)
except:
iteration += 1
continue
for line in content:
## Find more URLs
import re
## The regular expression we will use to search for URLs:
url_expression = r"http://+[\w\d:#@%/;$()~_?\+-=\\\.&]*"
regex = re.compile(url_expression)
## Find all the URLs and
results = regex.findall(line)
if results:
for result in results:
## If the URL is new, add it to the list
## of URLs we need to crawl over.
if result not in searched_list:
to_search_list.append(result)
## Find email addresses
## The regular expression we will use to search for email
## addresses. For more information on this, have a look at
## the "validating-email" example.
email_expression = r"\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,6}"
eregex = re.compile(email_expression)
## Find all the email addresses
e_results = eregex.findall(line)
if e_results:
for e_result in e_results:
## If the email address is new, add it to
## our email list.
if e_result not in email_list:
email_list.append(e_result)
email_list_withurl.append(first_url + ', ' + e_result)
iteration += 1
def output_results():
## This function will print the following information:
## number of sites in our sites to crawl list, the number
## of sites we actually crawled, and the total number of
## emails collected.
print
"Number of sites to search: %s" % len(to_search_list)
print
"Number of sites searched: %s" % len(searched_list)
print
"Number of emails collected: %s" % len(email_list)
def write_results():
## Write all the information that the
## output_results() function prints out (see above)
## into a file called "info.txt"
info_file_name = "info.txt"
i = open("info.txt", "w") ## create the file
i.write("Number of sites to search: %s \n" % len(to_search_list))
i.write("Number of sites searched: %s \n" % len(searched_list))
i.write("Number of emails collected: %s \n" % len(email_list))
i.close()
## Write down all the emails collected into a file called
## "email_addresses.txt". We will use this file in the next
## part of this example.
file_name = "email_addresses.txt"
n = open(file_name, "w")
for email in email_list:
entry = email + "\n"
n.write(entry)
n.close()
file_name = "email_addresses_withurl.txt"
n = open(file_name, "w")
for email in email_list_withurl:
entry = email + "\n"
n.write(entry)
n.close()
def get_input():
## Gather input from the user using sys.argv
try:
filename = sys.argv[1]
except:
raise Exception("\n\nSorry, invalid input. Please enter one arguments: the website URL.\n")
return filename
def main():
filename = get_input()
with open(filename) as f:
content = f.readlines()
for i in content:
urltosearch = i.rstrip('\n').lstrip(' ')
to_search_list.append(urltosearch)
iterations = len(to_search_list)
download(iterations)
output_results()
write_results()
if __name__ == "__main__":
main()