forked from miker123/Python-Web-Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbeautifulSoupExample.py
44 lines (40 loc) · 1.27 KB
/
beautifulSoupExample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib
# Mike R
# BeautifulSoup example: download a page and walk through the basic bs4 API
# (tag attributes, element navigation, find_all, attribute lookup, get_text).
# Uses only syntax that is valid on both Python 2 and Python 3.
from contextlib import closing

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

DEFAULT_URL = "http://espn.go.com"


def fetch_html(url):
    """Download *url* and return the raw response body.

    The response is wrapped in ``closing`` so the connection is released even
    if reading fails; this works whether or not the response object is itself
    a context manager.
    """
    with closing(urlopen(url)) as response:
        print(response)  # the file-like response object, not the page content
        return response.read()


def link_targets(anchors):
    """Return the ``href`` value of every anchor that has one.

    Anchors without an ``href`` attribute (for example named anchors) are
    skipped instead of raising ``KeyError``. *anchors* may be any iterable of
    objects with a dict-style ``get`` method, such as bs4 ``Tag`` objects.
    """
    hrefs = (anchor.get('href') for anchor in anchors)
    return [href for href in hrefs if href is not None]


def main(url=DEFAULT_URL):
    """Fetch *url*, parse it, and print each demonstrated piece of the tree."""
    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed; "lxml" requires the lxml package.
    bt = BeautifulSoup(fetch_html(url), "lxml")
    print(bt)  # the entire parsed document

    # A tag name used as an attribute returns the first matching tag, or None
    # when the document has no such tag.
    title = bt.title
    print(title)
    if title is not None:
        print(title.string)  # navigable string inside the tag (unicode text)
        print(title.name)  # the exact tag name

    # First meta tag, then the next two elements in document order.
    # next_element is the documented name for the legacy ".next" alias.
    element = bt.meta
    print(element)
    for _ in range(2):
        if element is None:
            break
        element = element.next_element
        print(element)

    # find() returns a single match; find_all() returns a list of all matches.
    # find_all() also accepts a compiled regular expression; CSS selectors go
    # through select().
    all_meta_tags = bt.find_all('meta')
    print(all_meta_tags)
    if all_meta_tags:
        first_meta = all_meta_tags[0]
        print(first_meta)
        # Attributes read like dictionary values; get() yields None rather
        # than raising KeyError when the tag has no "content" attribute.
        print(first_meta.get('content'))

    # Every <a> tag in the document, then the href of each one that has it.
    all_links = bt.find_all('a')
    print(len(all_links))
    for href in link_targets(all_links):
        print(href)  # relative and absolute links, as written in the page

    # Text content of the whole document with the markup stripped.
    print(bt.get_text())


if __name__ == "__main__":
    main()