'''
Dependencies: Python 3, NLTK, Flask. The last two can be installed from Python's pip.

Bibliography:

[1] Perkins, Jacob. Text Classification for Sentiment Analysis – Naive Bayes Classifier, 2010. Retrieved June 30, 2015 from http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/ .

[2] Perkins, Jacob. Python 3 Text Processing with NLTK 3 Cookbook, 2nd edition. Packt Publishing, Birmingham, UK, 2014.

[3] NLTK Project. nltk.tokenize package, 2015. Retrieved June 30, 2015 from http://www.nltk.org/api/nltk.tokenize.html .
'''
|
||
|
||
# Assorted imports required below.
# Standard library:
from math import floor
# Third-party (NLTK): corpus reading, classification, and tokenization.
from nltk.corpus.reader import CategorizedPlaintextCorpusReader as corpusreader
from nltk.classify import util, NaiveBayesClassifier
from nltk.tokenize import RegexpTokenizer as tokenizer
# Third-party (Flask): the web front-end.
from flask import Flask, request, url_for, render_template
|
||
|
||
# def sentimentprepare():
# Module-level setup: runs once at import time and builds the corpus
# reader plus the trained Naive Bayes classifier used by the webapp.
# trainingpercentage = 80 # % - Set at will here.

# Review files live under ./pos/*.txt and ./neg/*.txt; the categorized
# corpus reader assigns each file to the category named by its folder.
sentimentcorpus = corpusreader('.', r'(pos|neg)/.*\.txt', cat_pattern=r'(pos|neg)')

# File identifiers for each category.
posids = sentimentcorpus.fileids('pos')
negids = sentimentcorpus.fileids('neg')


def _bagofwords(fileid):
    # Bag-of-words feature dict for one corpus file; words are
    # lower-cased for normalization.
    return {word.lower(): True for word in sentimentcorpus.words(fileids=[fileid])}


# One (features, label) pair per review file.
posfeatures = [(_bagofwords(fid), 'pos') for fid in posids]
negfeatures = [(_bagofwords(fid), 'neg') for fid in negids]

# The commented lines below implement the train/test split driven by
# trainingpercentage; currently the full corpus is used for training.
# trainingfeatures = negfeatures[:floor( len(negfeatures)*trainingpercentage/100 )] + posfeatures[:floor( len(posfeatures)*trainingpercentage/100 )]
trainingfeatures = negfeatures + posfeatures
# testingfeatures = negfeatures[ floor( len(negfeatures)*trainingpercentage/100):] + posfeatures[ floor( len(posfeatures)*trainingpercentage/100):]

# Train the Naive Bayes classifier on the training subset.
classifier = NaiveBayesClassifier.train(trainingfeatures)

# Optional accuracy check against the held-out testing subset.
# accuracy = util.accuracy(classifier, testingfeatures)
|
||
|
||
def sentimentverify(input):
    """Classify a plain-text review as positive or negative.

    Parameters:
        input: the raw review text (str).

    Returns:
        (reviewtext, reviewscore) where reviewtext is the Spanish label
        "positiva" or "negativa" and reviewscore is the classifier's
        probability for that label formatted as "NN.NN%".
    """
    # Tokenize the lower-cased review into the same word-level units the
    # corpus reader produces. The pattern is a raw string: \w, \$, \d
    # and \S are regex escapes, not Python string escapes.
    # BUGFIX: the original discarded this token list and then iterated
    # the raw string, building a bag of *characters*; the classifier was
    # therefore scoring single letters instead of words.
    tokens = tokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(input.lower())

    # Bag-of-words feature dictionary for this particular review.
    wordict = {word: True for word in tokens}

    # Hard classification: 'pos' or 'neg'.
    classification = classifier.classify(wordict)

    # Probability distribution over both labels (computed once instead
    # of calling prob_classify twice).
    probdist = classifier.prob_classify(wordict)

    # Format the label and its probability for the webapp.
    if classification == 'pos':
        reviewtext = "positiva"
        reviewscore = "{0:.2f}%".format(probdist.prob('pos') * 100)
    else:
        reviewtext = "negativa"
        reviewscore = "{0:.2f}%".format(probdist.prob('neg') * 100)

    # return reviewtext, reviewscore, reviewaccuracy
    return reviewtext, reviewscore
|
||
|
||
# Kickstart the Flask webapp. __name__ lets Flask locate templates and
# static files relative to this module.
app = Flask(__name__)
|
||
|
||
# The home page will be index.html, so references to root will be redirected there.
@app.route('/', methods=['POST', 'GET'])
@app.route('/index.html', methods=['POST', 'GET'])
def index():
    """Serve the home page and, on POST, score the submitted review."""
    # Registering the stylesheet with url_for adds static-file support.
    url_for('static', filename='style.css')

    # Template variables default to None so GET renders an empty form.
    error = None
    reviewtext = None
    reviewscore = None
    # reviewaccuracy = None

    # A submitted form is passed through the sentiment-analysis engine.
    if request.method == 'POST':
        submitted = request.form['text']
        if submitted:
            # reviewtext, reviewscore, reviewaccuracy = sentimentverify(request.form['text'])
            reviewtext, reviewscore = sentimentverify(submitted)
        else:
            error = 'Se requiere una reseña.'

    # Render the template with the computed label, score and any error.
    # return render_template('/index.html', reviewtext=reviewtext, reviewscore=reviewscore, reviewaccuracy=reviewaccuracy, error=error)
    return render_template('/index.html', reviewtext=reviewtext, reviewscore=reviewscore, error=error)
|
||
|
||
# Run the webapp in localhost (typically http://localhost:5050 )
if __name__ == '__main__':
    # NOTE(review): debug=True combined with host='0.0.0.0' exposes the
    # Werkzeug interactive debugger to the whole network — confirm this
    # configuration is never used outside local development.
    app.run(host='0.0.0.0', debug=True)
|