You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

164 lines
6.0 KiB

"""
Here the models for our database is defined.
I am using Postgres, Flask-SQLAlchemy for this application.
For an introduction to Flask-SQLAlchemy check out: http://flask-sqlalchemy.pocoo.org/2.1/
"""
from app import db
class ImageToText(db.Model):
"""
This model stores the lookup for an image to text from Keras Models defined in image_processing.py
parameters:
@file_name - the filename being processed
@labels - the set of labels associated with the filename
@state - the state or province the ad appeared in
@city - the city or town the ad appeared in
@location - the location parsed from the ad
@url - the url of the ad
@timestamp - the timestamp of when the ad was scraped
@phone_number - the phone number associated with the ad
@latitude - latitude parsed from the ad
@longitude - longitude parsed from the ad
@image_url - image_url used for image lookup
"""
__tablename__ = 'image_to_text'
id = db.Column(db.Integer, primary_key=True)
filename = db.Column(db.String)
labels = db.Column(db.String)
state = db.Column(db.String)
city = db.Column(db.String)
location = db.Column(db.String)
url = db.Column(db.String)
timestamp = db.Column(db.DateTime)
phone_number = db.Column(db.String)
latitude = db.Column(db.String)
longitude = db.Column(db.String)
image_url = db.Column(db.String)
throw_away = db.Column(db.String)
def __init__(
self, image_url, filename, labels, state, city,
location, url, timestamp, phone_number,
latitude, longitude,throw_away
):
self.image_url = image_url
self.filename = filename
self.labels = labels
self.state = state
self.city = city
self.location = location
self.url = url
self.timestamp = timestamp
self.phone_number = phone_number
self.latitude = latitude
self.longitude = longitude
self.throw_away = throw_away
class AreaCodeLookup(db.Model):
"""
This model provides a look up for phone number area codes and aids in converting them to latitude, longitude.
Specifically this mapping provides:
Area code and it's corresponding township.
From there geopy provides the lookup to latitude, longitude
Because location may not be unique - there could be multiple towns with the same name,
there is not a 100% guarantee all lookups will be accurate.
Source: https://www.allareacodes.com/
parameters:
@area_code - the area code from a phone number
@city - a string city
@state - a string state
@latitude - latitude for the area code
@longitude - longitude for the area code
"""
__tablename__ = "areacode_lookup"
id = db.Column(db.Integer, primary_key=True)
area_code = db.Column(db.String)
city = db.Column(db.String)
state = db.Column(db.String)
latitude = db.Column(db.String)
longitude = db.Column(db.String)
def __init__(self, area_code, city, state, latitude, longitude):
self.area_code = area_code
self.city = city
self.state = state
self.latitude = latitude
self.longitude = longitude
class BackpageAdInfo(db.Model):
"""
This model gives us a set of specific information from each add scraped from backpage.
parameters:
@ad_title - used primarily to uniquely identify backpage ads - since titles are unique
@phone_number - the phone number used in the ad, can be empty. This number is stored as a string
since it should be thought of as immutable.
@city - the city the add is from
@state - the state the add is from
@location - the location mentioned in the advertisement
@latitude - latitude derived from the location mentioned in the advertisement
@longitude - longitude derived from the location mentioned in the advertisement
@ad_body - the long form text in the ad
@photos - a filepath link to the set of pictures downloaded for the ad
@post_id - an id for each backpage post from backpage
@timestamp - when the ad was scraped
@url - the url of the scraped ad
"""
__tablename__ = 'ad_info'
id = db.Column(db.Integer, primary_key=True)
ad_title = db.Column(db.String)
phone_number = db.Column(db.String)
location = db.Column(db.String)
latitude = db.Column(db.String)
longitude = db.Column(db.String)
ad_body = db.Column(db.String)
photos = db.Column(db.String)
post_id = db.Column(db.String)
timestamp = db.Column(db.DateTime)
city = db.Column(db.String)
state = db.Column(db.String)
url = db.Column(db.String)
def __init__(self,url, ad_title, phone_number, ad_body, location, latitude, longitude, photos, post_id,timestamp, city, state):
self.url = url
self.ad_title = ad_title
self.phone_number = phone_number
self.location = location
self.latitude = latitude
self.longitude = longitude
self.ad_body = ad_body
self.photos = photos
self.post_id = post_id
self.timestamp = timestamp
self.city = city
self.state = state
class Backpage(db.Model):
"""
This model gives us high level information about backpage, the website.
It is used to determine some metrics found in lectures/scraping_the_web.md
parameters:
@timestamp - this is the time at which the content was scraped, it is assumed scrapers will run all the time,
therefore the scrape time should be accurate to within an hour of scraping, this is used in some of the metrics
for analysis.
@frequency - this is the number of ads scraped at @timestamp and is used in many of the metrics for the scraper.
"""
__tablename__ = 'backpage'
id = db.Column(db.Integer, primary_key=True)
timestamp = db.Column(db.DateTime)
frequency = db.Column(db.Integer)
def __init__(self,timestamp,frequency):
self.timestamp = timestamp
self.frequency = frequency