Python Web Scraping

From rbachwiki
Revision as of 16:20, 1 September 2020 by Bacchas (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

Web Scraping

# pip install bs4 requests pandas
# (package names are separated by spaces, not commas; installing them one at a time also works)
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Forecast pages to scrape; swap the variable passed to requests.get() below
# to pull a different location.
nj = 'https://forecast.weather.gov/MapClick.php?lat=40.89165000000003&lon=-74.04688499999997#.XgvA5xdKhUQ'
alaska = 'https://forecast.weather.gov/MapClick.php?lat=64.0003&lon=-150.0003#.XgvO1BdKhUQ'

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as a forecast.
page = requests.get(alaska, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# print(soup.find_all('a')) # find all a tags

# find() returns None when nothing matches, so fail with a clear message
# instead of an AttributeError on the next line.
week = soup.find(id='seven-day-forecast-body')
if week is None:
    raise SystemExit("Could not find 'seven-day-forecast-body' in the page")
items = week.find_all(class_='tombstone-container')
# print(items[0])


def tombstone_text(item, css_class):
    """Return the text of the first element with css_class inside item.

    Returns an empty string when item has no such element, so one incomplete
    forecast entry does not abort the whole scrape and the three columns
    built below always have the same length.
    """
    tag = item.find(class_=css_class)
    return tag.get_text() if tag is not None else ''


# Look at the first forecast period on its own before processing them all.
if items:
    item1 = tombstone_text(items[0], 'period-name')
    item2 = tombstone_text(items[0], 'short-desc')
    item3 = tombstone_text(items[0], 'temp')
    # print('Weather: ' + item1 + ' - ' + item2 + '  - ' + item3)

# One list per column, each with exactly one entry per forecast period.
period_names = [tombstone_text(item, 'period-name') for item in items]
short_desc = [tombstone_text(item, 'short-desc') for item in items]
temp = [tombstone_text(item, 'temp') for item in items]
# print(period_names)
# print(short_desc)
# print(temp)

weather_stuff = pd.DataFrame(
    {
        'period': period_names,
        'short_description': short_desc,
        'temperatures': temp,
    }
)

print(weather_stuff)
# Save the table in both formats in the current working directory.
weather_stuff.to_csv('alaska.csv')
weather_stuff.to_html('alaska.html')


# pip install bs4 requests pandas lxml
# (package names are separated by spaces, not commas; installing them one at a time also works)
import requests
from bs4 import BeautifulSoup
import pandas as pd
import lxml
import csv
import sys

# NOTE: the old reload(sys) / sys.setdefaultencoding('utf8') pair was a
# Python 2-only workaround and raises NameError on Python 3. The output
# encoding is now set explicitly where the CSV file is opened.

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as the blog.
response = requests.get('http://coreyms.com', timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')

# The with-block closes the file even if parsing an article raises.
# newline='' is what the csv module expects of files it writes to; without it
# extra blank rows can appear on Windows.
with open('web_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'vidsource'])

    for article in soup.find_all('article'):
        headline = article.h2.a.text
        summary = article.find('div', class_='entry-content').p.text

        # Get the value of the embedded video's src attribute. find() returns
        # None when the article has no iframe, and .get() covers an iframe
        # that has no src attribute.
        iframe = article.find('iframe')
        if iframe is None:
            vidsource = '* No video'
        else:
            vidsource = iframe.get('src', '* No video')

        # Parsing out part of a string,
        # eg: http://youtube.com/embed/12345-7o?version=3&rel=1&fs
        # vid_id = vidsource.split('/')[4]
        # this splits the url on "/"; index 4 (the fifth piece) starts with the youtube id
        # vid_id = vid_id.split('?')[0]
        # put the link together, the "f" means formatted: youtube prefix url is always the same
        # yt_link = f'https://youtube.com/watch?v={vid_id}'
        # print(article.prettify())
        print(headline)
        print(summary)
        print(vidsource)
        print('--------------------------')
        csv_writer.writerow([headline, summary, vidsource])



Back To Top - Category