sentiment-analysis-corpus-s.../sentiment.py
2017-08-14 12:13:59 -06:00

88 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

'''
Dependencies: Python 3, NLTK, Flask. NLTK and Flask can be installed with pip.
Bibliography:
[1] Perkins, Jacob. Text Classification for Sentiment Analysis Naive Bayes Classifier, 2010. Retrieved June 30, 2015 from http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/ .
[2] Perkins, Jacob. Python 3 Text Processing with NLTK 3 Cookbook, 2nd edition. Packt Publishing, Birmingham, UK, 2014.
[3] NLTK Project. nltk.tokenize package, 2015. Retrieved June 30, 2015 from http://www.nltk.org/api/nltk.tokenize.html .
'''
# Assorted imports required below.
from nltk.corpus.reader import CategorizedPlaintextCorpusReader as corpusreader
from nltk.classify import util, NaiveBayesClassifier
from nltk.tokenize import RegexpTokenizer as tokenizer
from math import floor
from flask import Flask, request, url_for, render_template
# One-time setup: everything in this section runs at import time and leaves a
# trained `classifier` at module level for the request handlers to use.

# The corpus lives in ./pos/<name>.txt and ./neg/<name>.txt. The categorized
# reader picks up every matching file and takes its category ('pos' or 'neg')
# from the part of the file id matched by cat_pattern.
sentimentcorpus = corpusreader(
    '.',
    r'(pos|neg)/.*\.txt',
    cat_pattern=r'(pos|neg)',
)

# File ids of the reviews in each category.
posids = sentimentcorpus.fileids('pos')
negids = sentimentcorpus.fileids('neg')

# Bag-of-words model: each file becomes a {lower-cased word: True} dictionary
# paired with the label of the category it was read from.
posfeatures = [
    ({token.lower(): True for token in sentimentcorpus.words(fileids=[fileid])}, 'pos')
    for fileid in posids
]
negfeatures = [
    ({token.lower(): True for token in sentimentcorpus.words(fileids=[fileid])}, 'neg')
    for fileid in negids
]

# The whole corpus is used for training. To measure accuracy instead, train on
# the first N% of each list (floor(len(features) * N / 100) items) and pass the
# remaining items to util.accuracy(classifier, testingfeatures).
trainingfeatures = negfeatures + posfeatures

# Train the Naive Bayes classifier on the labelled feature sets.
classifier = NaiveBayesClassifier.train(trainingfeatures)
def sentimentverify(input):
    """Classify a plain-text review as positive or negative.

    The review is lower-cased, split into tokens and turned into a
    bag-of-words feature dictionary, which is then scored by the module-level
    ``classifier``.

    Args:
        input: The review text as a string. The parameter name shadows the
            builtin; it is kept so existing callers are unaffected.

    Returns:
        A ``(reviewtext, reviewscore)`` tuple. ``reviewtext`` is
        ``"positiva"`` or ``"negativa"``; ``reviewscore`` is the probability
        of that label formatted as a percentage with two decimals
        (for example ``"87.50%"``).
    """
    # Tokenize the lower-cased review. The result has to be captured: the
    # features must be whole words, like the lower-cased word features the
    # classifier was trained on, not the individual characters of the string.
    words = tokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(input.lower())
    # Bag-of-words dictionary for this particular review.
    wordict = {word: True for word in words}
    # Compute the label distribution once; the most probable label and the
    # per-label probabilities are all read from it.
    distribution = classifier.prob_classify(wordict)
    classification = distribution.max()
    if classification == 'pos':
        reviewtext = "positiva"
        probability = distribution.prob('pos')
    else:
        reviewtext = "negativa"
        probability = distribution.prob('neg')
    # Format the winning label's probability as a percentage for the webapp.
    reviewscore = "{0:.2f}%".format(probability * 100)
    return reviewtext, reviewscore
# Create the Flask application object; the route decorators below register their views on it.
app = Flask(__name__)
# The same view serves both the site root and /index.html.
@app.route('/', methods=['POST', 'GET'])
@app.route('/index.html', methods=['POST', 'GET'])
def index():
    """Render the review form and, on POST, the sentiment of the submitted review.

    On GET the page is rendered with no result. On POST the ``text`` form
    field is passed to ``sentimentverify``; if the field is missing, empty or
    only whitespace, an error message is rendered instead.

    Returns:
        The rendered ``index.html`` template, given ``reviewtext``,
        ``reviewscore`` and ``error`` (each ``None`` when not applicable).
    """
    error = None
    reviewtext = None
    reviewscore = None
    if request.method == 'POST':
        # .get() keeps a missing 'text' field on the normal validation path
        # instead of letting Flask abort the request with a bare HTTP 400.
        text = request.form.get('text', '')
        if text.strip():
            reviewtext, reviewscore = sentimentverify(text)
        else:
            error = 'Se requiere una reseña.'
    # Render the page, filling in whichever of the values were produced.
    return render_template('/index.html', reviewtext=reviewtext, reviewscore=reviewscore, error=error)
# Start Flask's development server when this file is executed as a script.
# It listens on every network interface (0.0.0.0); with no explicit port,
# Flask uses its default, 5000.
if __name__ == '__main__':
    import os

    # The interactive debugger lets anyone who can reach the server execute
    # arbitrary Python code, so it is opt-in: set FLASK_DEBUG to 1, true or yes.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(host='0.0.0.0', debug=debug_enabled)