diff --git a/examples/intermediate_example/app/__init__.py b/examples/intermediate_example/app/__init__.py
new file mode 100644
index 00000000..d9bb6891
--- /dev/null
+++ b/examples/intermediate_example/app/__init__.py
@@ -0,0 +1,26 @@
+"""Application package: builds the Flask app and its Flask-Script manager."""
+from flask import Flask
+from flask_script import Manager
+#from flask.ext.sqlalchemy import SQLAlchemy
+#from flask.ext.migrate import Migrate, MigrateCommand
+from .commands import REPL
+import os
+
+# Database credentials are read from the environment; the literals are only
+# the local fallback (the database wiring below is still disabled).
+username = os.getenv("DB_USERNAME", "eric_s")
+password = os.getenv("DB_PASSWORD", "1234")
+app = Flask(__name__)
+#app.config["SQLALCHEMY_DATABASE_URI"] = os.getenv("DATABASE_URL")
+#app.config["SQLALCHEMY_DATABASE_URI"] = "postgresql://"+username+":"+password+"@localhost/backpage_ads"
+#db = SQLAlchemy(app)
+#migrate = Migrate(app,db)
+
+# manage.py does `from app import manager`, so the manager has to exist even
+# while the database commands stay disabled.
+manager = Manager(app)
+#manager.add_command('db', MigrateCommand)
+manager.add_command("shell",REPL())
+
+# Imported last on purpose: app/views.py imports `app` from this package.
+from app import views #,models
diff --git a/examples/intermediate_example/app/commands.py b/examples/intermediate_example/app/commands.py
new file mode 100644
index 00000000..e68f2b57
--- /dev/null
+++ b/examples/intermediate_example/app/commands.py
@@ -0,0 +1,9 @@
+from flask_script import Command
+import code
+
+class REPL(Command):
+    "runs the shell"
+
+    def run(self):
+        code.interact(local=locals())
+
diff --git a/examples/intermediate_example/app/models.py b/examples/intermediate_example/app/models.py
new file mode 100644
index 00000000..cb275672
--- /dev/null
+++ b/examples/intermediate_example/app/models.py
@@ -0,0 +1,164 @@
+"""
+
+Here the models for our database are defined.
+
+I am using Postgres, Flask-SQLAlchemy for this application.
+
+For an introduction to Flask-SQLAlchemy check out: http://flask-sqlalchemy.pocoo.org/2.1/
+"""
+from app import db
+
+class ImageToText(db.Model):
+    """
+    This model stores the lookup for an image to text from Keras Models defined in image_processing.py
+    parameters:
+    @filename - the filename being processed
+    @labels - the set of labels associated with the filename
+    @state - the state or province the ad appeared in
+    @city - the city or town the ad appeared in
+    @location - the location parsed from the ad
+    @url - the url of the ad
+    @timestamp - the timestamp of when the ad was scraped
+    @phone_number - the phone number associated with the ad
+    @latitude - latitude parsed from the ad
+    @longitude - longitude parsed from the ad
+    @image_url - image_url used for image lookup
+    """
+
+    __tablename__ = 'image_to_text'
+    id = db.Column(db.Integer, primary_key=True)
+    filename = db.Column(db.String)
+    labels = db.Column(db.String)
+    state = db.Column(db.String)
+    city = db.Column(db.String)
+    location = db.Column(db.String)
+    url = db.Column(db.String)
+    timestamp = db.Column(db.DateTime)
+    phone_number = db.Column(db.String)
+    latitude = db.Column(db.String)
+    longitude = db.Column(db.String)
+    image_url = db.Column(db.String)
+    throw_away = db.Column(db.String)  # NOTE(review): undocumented column - purpose unclear, confirm
+
+    def __init__(
+        self, image_url, filename, labels, state, city,
+        location, url, timestamp, phone_number,
+        latitude, longitude,throw_away
+    ):
+        self.image_url = image_url
+        self.filename = filename
+        self.labels = labels
+        self.state = state
+        self.city = city
+        self.location = location
+        self.url = url
+        self.timestamp = timestamp
+        self.phone_number = phone_number
+        self.latitude = latitude
+        self.longitude = longitude
+        self.throw_away = throw_away
+
+
+class AreaCodeLookup(db.Model):
+    """
+    This model provides a look up for phone number area codes and aids in converting them to latitude, longitude.
+    Specifically this mapping provides:
+    Area code and its corresponding township.
+    From there geopy provides the lookup to latitude, longitude
+
+    Because location may not be unique - there could be multiple towns with the same name,
+    there is not a 100% guarantee all lookups will be accurate.
+
+    Source: https://www.allareacodes.com/
+    parameters:
+    @area_code - the area code from a phone number
+    @city - a string city
+    @state - a string state
+    @latitude - latitude for the area code
+    @longitude - longitude for the area code
+    """
+    __tablename__ = "areacode_lookup"
+    id = db.Column(db.Integer, primary_key=True)
+    area_code = db.Column(db.String)
+    city = db.Column(db.String)
+    state = db.Column(db.String)
+    latitude = db.Column(db.String)
+    longitude = db.Column(db.String)
+
+    def __init__(self, area_code, city, state, latitude, longitude):
+        self.area_code = area_code
+        self.city = city
+        self.state = state
+        self.latitude = latitude
+        self.longitude = longitude
+
+
+class BackpageAdInfo(db.Model):
+    """
+    This model gives us a set of specific information from each ad scraped from backpage.
+
+    parameters:
+    @ad_title - used primarily to uniquely identify backpage ads - since titles are unique
+    @phone_number - the phone number used in the ad, can be empty. This number is stored as a string
+        since it should be thought of as immutable.
+    @city - the city the ad is from
+    @state - the state the ad is from
+    @location - the location mentioned in the advertisement
+    @latitude - latitude derived from the location mentioned in the advertisement
+    @longitude - longitude derived from the location mentioned in the advertisement
+    @ad_body - the long form text in the ad
+    @photos - a filepath link to the set of pictures downloaded for the ad
+    @post_id - an id for each backpage post from backpage
+    @timestamp - when the ad was scraped
+    @url - the url of the scraped ad
+    """
+    __tablename__ = 'ad_info'
+    id = db.Column(db.Integer, primary_key=True)
+    ad_title = db.Column(db.String)
+    phone_number = db.Column(db.String)
+    location = db.Column(db.String)
+    latitude = db.Column(db.String)
+    longitude = db.Column(db.String)
+    ad_body = db.Column(db.String)
+    photos = db.Column(db.String)
+    post_id = db.Column(db.String)
+    timestamp = db.Column(db.DateTime)
+    city = db.Column(db.String)
+    state = db.Column(db.String)
+    url = db.Column(db.String)
+
+    def __init__(self,url, ad_title, phone_number, ad_body, location, latitude, longitude, photos, post_id,timestamp, city, state):
+        self.url = url
+        self.ad_title = ad_title
+        self.phone_number = phone_number
+        self.location = location
+        self.latitude = latitude
+        self.longitude = longitude
+        self.ad_body = ad_body
+        self.photos = photos
+        self.post_id = post_id
+        self.timestamp = timestamp
+        self.city = city
+        self.state = state
+
+
+class Backpage(db.Model):
+    """
+    This model gives us high level information about backpage, the website.
+    It is used to determine some metrics found in lectures/scraping_the_web.md
+
+    parameters:
+    @timestamp - this is the time at which the content was scraped, it is assumed scrapers will run all the time,
+        therefore the scrape time should be accurate to within an hour of scraping, this is used in some of the metrics
+        for analysis.
+    @frequency - this is the number of ads scraped at @timestamp and is used in many of the metrics for the scraper.
+    """
+    __tablename__ = 'backpage'
+    id = db.Column(db.Integer, primary_key=True)
+    timestamp = db.Column(db.DateTime)
+    frequency = db.Column(db.Integer)
+
+    def __init__(self,timestamp,frequency):
+        self.timestamp = timestamp
+        self.frequency = frequency
+
diff --git a/examples/intermediate_example/app/static/js/main-2.js b/examples/intermediate_example/app/static/js/main-2.js
new file mode 100644
index 00000000..927feaa3
--- /dev/null
+++ b/examples/intermediate_example/app/static/js/main-2.js
@@ -0,0 +1,52 @@
+(() => {
+'use strict';
+
+// Draw the mention counts returned by /api as a C3 time-series chart.
+// `mentions` is the parsed JSON body: a count per search term plus a
+// "timestamp" string.
+function buildGraph(mentions) {
+  // C3 columns are arrays of the form [seriesName, value, ...], with the 'x'
+  // series holding the dates. Only the leading YYYY-MM-DD part of the
+  // timestamp is kept so it parses with C3's default x format.
+  const day = String(mentions.timestamp).slice(0, 10);
+
+  var chart = c3.generate({
+    data: {
+      x: 'x',
+      columns: [
+        ['x', day],
+        ['cop', mentions.cop],
+        ['cops', mentions.cops]
+      ]
+    },
+    axis: {
+      x: {
+        type: 'timeseries',
+        tick: {
+          format: '%Y-%m-%d'
+        }
+      }
+    }
+  });
+  return chart;
+}
+
+// Request the mention counts; resolves to the parsed JSON body.
+function getArticles() {
+  return axios.get('http://localhost:5000/api')
+    .then(function (response) {
+      console.log(response);
+      return response.data;
+    });
+}
+
+
+// axios.get() is asynchronous, so the chart is built once the promise resolves.
+getArticles()
+  .then(buildGraph)
+  .catch(function (error) {
+    console.log(error);
+  });
+
+})();
+
diff --git a/examples/intermediate_example/app/static/js/main.js b/examples/intermediate_example/app/static/js/main.js
new file mode 100644
index 00000000..56a77d9f
--- /dev/null
+++ b/examples/intermediate_example/app/static/js/main.js
@@ -0,0 +1,46 @@
+
+(() => {
+
+function buildGraph() {
+'use strict';
+
+  var chart = c3.generate({
+    data: {
+      x: 'x',
+//      xFormat: '%Y%m%d', // 'xFormat' can be used as custom format of 'x'
+      columns: [
+        ['x', '2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06'],
+//        ['x', '20130101', '20130102', '20130103', '20130104', '20130105', '20130106'],
+        ['data1', 30, 200, 100, 400, 150, 250],
+        ['data2', 130, 340, 200, 500, 250, 350]
+      ]
+    },
+    axis: {
+      x: {
+        type: 'timeseries',
+        tick: {
+          format: '%Y-%m-%d'
+        }
+      }
+    }
+  });
+}
+
+function getArticles() {
+  const resp = axios.get('')
+    .then(function (response) {
+      console.log(response);
+    })
+    .catch(function (error) {
+      console.log(error);
+    });
+
+  // parse
+  // return parsed response
+}
+
+
+buildGraph(getArticles());
+
+})();
+
diff --git a/examples/intermediate_example/app/templates/index.html b/examples/intermediate_example/app/templates/index.html
new file mode 100644
index 00000000..db21d5dc
--- /dev/null
+++ b/examples/intermediate_example/app/templates/index.html
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/intermediate_example/app/views.py b/examples/intermediate_example/app/views.py
new file mode 100644
index 00000000..14ec40af
--- /dev/null
+++ b/examples/intermediate_example/app/views.py
@@ -0,0 +1,89 @@
+from app import app
+#from app import db
+from flask import render_template, request, jsonify
+import json
+import os
+import requests
+import pandas as pd
+from datetime import datetime
+
+def xor(v1, v2):
+    """Return True when exactly one of v1 and v2 is truthy."""
+    return bool(v1) != bool(v2)
+
+def found(val):
+    """Return False for -1 (the "not found" result of str.find), else True."""
+    return val != -1
+
+def fetch_news():
+    """
+    Count how many current articles mention each search term.
+
+    Queries newsapi.org for every source in `sources` and scans each article's
+    title and description for the terms in `list_of_words`.
+
+    returns:
+    (mention_count, link_mentions) - two dicts keyed by search term: the number
+    of matching articles, and the list of their urls. Both also carry a
+    "timestamp" key holding the time of the fetch as a string.
+    """
+    # The key can be overridden through the environment; the literal stays as
+    # the fallback so existing set-ups keep working.
+    API_KEY = os.getenv('NEWS_API_KEY', 'e750e0189ede4b6b8b1a766b8523b29a')
+    sources = [
+        'techcrunch', 'reuters', 'newsweek', 'new-york-times',
+        'the-wall-street-journal', 'the-washington-post',
+    ]
+
+    # NOTE(review): 'homocide' is presumably a misspelling of 'homicide'; it is
+    # also a key of the returned dicts, so renaming it changes the /api payload.
+    list_of_words =['cop', 'cops', 'crime', 'law enforcement', 'homocide', 'crime rate', 'white collar crime', 'blue collar crime']
+
+    mention_count = {}.fromkeys(list_of_words, 0)
+    # One set per word: fromkeys(list_of_words, set()) would make every key
+    # share the same set, so each word would list the urls of all words.
+    link_mentions = {word: set() for word in list_of_words}
+
+    # Gather the articles of every source. Merging the JSON bodies with
+    # dict.update() would replace 'articles' each time and keep only the last.
+    articles = []
+    for source in sources:
+        resp = requests.get(
+            'https://newsapi.org/v1/articles',
+            params={'source': source, 'apiKey': API_KEY},
+            timeout=10,
+        )
+        articles.extend(resp.json().get('articles') or [])
+
+    for article in articles:
+        # Guard against a null or missing field; treat it as empty text.
+        title = article.get('title') or ''
+        description = article.get('description') or ''
+        for word in list_of_words:
+            title_found = found(title.find(word))
+            description_found = found(description.find(word))
+            # NOTE(review): xor skips articles that name the word in BOTH the
+            # title and the description - confirm `or` is not what is wanted.
+            if xor(title_found, description_found):
+                mention_count[word] += 1
+                link_mentions[word].add(article['url'])
+
+    # One timestamp so both dicts agree.
+    timestamp = str(datetime.now())
+    link_mentions = {key: list(urls) for key, urls in link_mentions.items()}
+    mention_count["timestamp"] = timestamp
+    link_mentions["timestamp"] = timestamp
+    return mention_count, link_mentions
+
+
+@app.route("/api", methods=["GET", "POST"])
+def api():
+    """Return the per-term mention counts from fetch_news() as JSON."""
+    mention_count, link_mentions = fetch_news()
+    return jsonify(mention_count)
+
+
+@app.route("/", methods=["GET", "POST"])
+def index():
+    """Render templates/index.html."""
+    return render_template("index.html")
diff --git a/examples/intermediate_example/manage.py b/examples/intermediate_example/manage.py
new file mode 100644
index 00000000..e21de782
--- /dev/null
+++ b/examples/intermediate_example/manage.py
@@ -0,0 +1,3 @@
+from app import manager
+
+manager.run()
diff --git a/examples/intermediate_example/requirements.txt b/examples/intermediate_example/requirements.txt
new file mode 100644
index 00000000..be7e2759
--- /dev/null
+++ b/examples/intermediate_example/requirements.txt
@@ -0,0 +1,30 @@
+boto
+keras
+elasticsearch
+zipcode
+gunicorn
+geopy
+usaddress
+statsmodels
+scipy
+Flask
+Flask-Cors
+Flask-Migrate
+Flask-Script
+Flask-SQLAlchemy
+Jinja2
+lxml
+matplotlib
+nose
+num2words
+pandas
+patsy
+pbr
+plotly
+psycopg2
+requests
+SQLAlchemy
+Werkzeug
+twilio
+geopandas
+shapely
diff --git a/examples/intermediate_example/run_server.py b/examples/intermediate_example/run_server.py
new file mode 100644
index 00000000..9c57ff41
--- /dev/null
+++ b/examples/intermediate_example/run_server.py
@@ -0,0 +1,4 @@
+from app import app
+
+if __name__ == "__main__":
+    app.run(debug=True)
diff --git a/examples/intermediate_example/start_db.sh b/examples/intermediate_example/start_db.sh
new file mode 100755
index 00000000..293063e9
--- /dev/null
+++ b/examples/intermediate_example/start_db.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+pg_ctl -D /usr/local/var/postgres -l /usr/local/var/postgres/server.log start