Difference between revisions of "Python Web Scraping"
Jump to navigation
Jump to search
(Created page with "==Web Scraping==" — initial import of the scraping scripts; one intermediate revision by the same user not shown.)
==[[#top|Back To Top]] - [[Python|Category]]==
[[Category:Python]]
Latest revision as of 16:20, 1 September 2020
Web Scraping
# pip install bs4, requests, pandas
# install them one at a time
import requests
from bs4 import BeautifulSoup
import pandas as pd

# NWS point-forecast pages; the lat/lon query parameters select the location.
nj = 'https://forecast.weather.gov/MapClick.php?lat=40.89165000000003&lon=-74.04688499999997#.XgvA5xdKhUQ'
alaska = 'https://forecast.weather.gov/MapClick.php?lat=64.0003&lon=-150.0003#.XgvO1BdKhUQ'


def scrape_forecast(url):
    """Fetch a forecast.weather.gov page and return its 7-day forecast.

    Parameters:
        url: a forecast.weather.gov MapClick.php URL (e.g. ``nj`` or ``alaska``).

    Returns:
        pandas.DataFrame with columns 'period', 'short_description',
        'temperatures' — one row per forecast "tombstone" card.

    Raises:
        requests.HTTPError: on a non-2xx response (e.g. the site is down).
        ValueError: if the seven-day-forecast section is missing from the page.
    """
    # timeout: requests waits forever by default; raise_for_status: otherwise
    # we would silently try to parse an HTML error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    week = soup.find(id='seven-day-forecast-body')
    if week is None:
        raise ValueError('seven-day forecast section not found on page')

    items = week.find_all(class_='tombstone-container')
    return pd.DataFrame(
        {
            'period': [item.find(class_='period-name').get_text() for item in items],
            'short_description': [item.find(class_='short-desc').get_text() for item in items],
            'temperatures': [item.find(class_='temp').get_text() for item in items],
        }
    )


weather_stuff = scrape_forecast(alaska)
print(weather_stuff)
weather_stuff.to_csv('alaska.csv')
weather_stuff.to_html('alaska.html')
# pip install bs4, requests, pandas, lxml
# install them one at a time
import requests
from bs4 import BeautifulSoup
import pandas as pd
import lxml
import csv
import sys

# NOTE: the original called reload(sys) / sys.setdefaultencoding('utf8').
# Those are Python 2 idioms: in Python 3 reload() is not a builtin and
# sys.setdefaultencoding() does not exist, so the script crashed before doing
# any work. Python 3 strings are Unicode already; the calls are removed and
# the output file is opened with an explicit UTF-8 encoding instead.

source = requests.get('http://coreyms.com', timeout=30).text
soup = BeautifulSoup(source, 'lxml')

# newline='' stops the csv module from writing blank rows on Windows;
# the with-block guarantees the file is closed even if parsing raises.
with open('web_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'vidsource'])

    # Each blog post on the page is an <article> element.
    for article in soup.find_all('article'):
        headline = article.h2.a.text
        summary = article.find('div', class_='entry-content').p.text
        # Posts without a video have no <iframe>; find() then returns None and
        # subscripting it raises TypeError, which we treat as "no video".
        try:
            vidsource = article.find('iframe')['src']
        except TypeError:
            vidsource = '* No video'
        # To extract a YouTube id from e.g.
        # http://youtube.com/embed/12345-7o?version=3&rel=1&fs :
        #   vid_id = vidsource.split('/')[4].split('?')[0]
        #   yt_link = f'https://youtube.com/watch?v={vid_id}'
        print(headline)
        print(summary)
        print(vidsource)
        print('--------------------------')
        csv_writer.writerow([headline, summary, vidsource])