add data analysis

This commit is contained in:
Yaman Qalieh 2018-05-25 19:30:45 -04:00
parent 7f7baa5f88
commit 61d0c3cb09
2 changed files with 2079 additions and 8 deletions

2000
SmearcarDB/phoible Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,11 @@
from flask import Flask from flask import Flask
from flask import render_template, jsonify, request from flask import render_template, jsonify, request
from flask_sqlalchemy import SQLAlchemy from flask_sqlalchemy import SQLAlchemy
from numpy.polynomial.polynomial import polyfit
from numpy import corrcoef
import numpy as np
import tkinter
import matplotlib.pyplot as plt
from flask import send_file from flask import send_file
import datetime import datetime
import os import os
@ -41,7 +46,6 @@ class Update(db.Model):
date = db.Column(db.DateTime, nullable=False, date = db.Column(db.DateTime, nullable=False,
default=datetime.datetime.now()) default=datetime.datetime.now())
class Editor(db.Model): class Editor(db.Model):
id = db.Column(db.Integer, primary_key=True, autoincrement=True) id = db.Column(db.Integer, primary_key=True, autoincrement=True)
authority = db.Column(db.Integer, nullable=False, default=1) authority = db.Column(db.Integer, nullable=False, default=1)
@ -49,10 +53,76 @@ class Editor(db.Model):
# 1: Below + create Updates # 1: Below + create Updates
# 2: Edit values and Add files # 2: Edit values and Add files
# 3: No Access # 3: No Access
username = db.Column(db.String(32), nullable=False) username = db.Column(db.String(32), nullable=False)
password = db.Column(db.String(32), nullable=False) password = db.Column(db.String(32), nullable=False)
def rand_jitter(arr):
    """Return the values of *arr* with a little Gaussian noise added.

    The noise is drawn from a normal distribution whose standard deviation
    is 1% of the data range (``max - min``). This is used to separate
    overlapping points in a scatter plot.

    Args:
        arr: One-dimensional sequence of numbers (list or NumPy array).

    Returns:
        A NumPy float array with the same length as *arr*. An empty input
        gives an empty array. When every value is equal the range is zero,
        so the values come back unchanged.
    """
    values = np.asarray(arr, dtype=float)
    # max()/min() of an empty sequence raise ValueError; there is nothing
    # to jitter in that case, so hand back the empty array as is.
    if values.size == 0:
        return values
    stdev = .01 * (values.max() - values.min())
    return values + np.random.randn(len(values)) * stdev
def uniqueness():
    """Plot phoneme presence across languages against average frequency.

    For every phoneme that has at least one Frequency row, one point is
    produced:

    * x -- number of Frequency rows for the phoneme divided by the number
      of languages (presumably one row per language containing the
      phoneme; confirm against the Frequency model).
    * y -- mean of ``Frequency.value`` over those rows.

    The correlation matrix of x and y is printed, and a jittered scatter
    plot with a degree-1 least-squares fit line is shown with matplotlib.

    Nothing is printed or plotted when there are no languages or when no
    phoneme has any Frequency rows. Previously those cases raised
    ZeroDivisionError or failed inside the fit.
    """
    languages = Language.query.all()
    if not languages:
        return
    language_count = len(languages)

    x = []
    y = []
    for phoneme in Phoneme.query.all():
        frequencies = Frequency.query.filter_by(phoneme_id=phoneme.id).all()
        # A phoneme without any recorded frequency has no defined average;
        # skip it instead of dividing by zero.
        if not frequencies:
            continue
        values = [frequency.value for frequency in frequencies]
        x.append(len(frequencies) / language_count)
        y.append(sum(values) / len(frequencies))

    if not x:
        return

    print(corrcoef(x, y))
    # numpy.polynomial.polynomial.polyfit returns coefficients from the
    # lowest degree upwards: intercept first, then slope.
    b, m = polyfit(x, y, 1)
    # Only the x coordinates are jittered so that phonemes sharing the same
    # presence ratio do not hide each other.
    plt.scatter(rand_jitter(x), y, s=7)
    bestfit = [b + m * number for number in x]
    plt.plot(x, bestfit, '-')
    plt.xlabel("Phoneme Presence in Studied Languages")
    plt.ylabel("Average Frequency / %")
    plt.title("Figure 1")
    plt.show()
def phoneme_rank(scatter=False, detail=1000, textOutput=False, title="Figure 2"):
    """Rank phonemes by frequency weighted by each language's speaker count.

    For each phoneme the score is the sum, over its Frequency rows, of
    ``value * speakers[language] / total`` where ``total`` is the sum of
    all speaker counts in the table below. Phonemes are ordered by
    descending score.

    Args:
        scatter: If true, draw a line plot on a logarithmic y axis;
            otherwise draw a bar chart.
        detail: Maximum number of phonemes fetched from the database.
        textOutput: If true, return the ranked phoneme names and do not
            plot anything.
        title: Title of the figure.

    Returns:
        A tuple of phoneme names in rank order when *textOutput* is true
        (empty when there are no phonemes), otherwise ``None``.

    Raises:
        KeyError: If a Frequency row refers to a language id that is not in
            the Language table, or to a language whose name is missing
            from the speaker table.
    """
    # Speaker count per language name (presumably in millions -- confirm
    # the source of these figures).
    speakers = {
        'Spanish (Castillian)': 46.4,
        'English (American)': 308.9,
        'Spanish (American)': 435.7,
        'Japanese': 128,
        'German': 76,
        'Arabic': 315,
        'Mandarin': 909,
        'Portuguese (Brazilian)': 194,
        'French': 76.8,
        'Hindi': 260,
        'Polish': 40.3,
        'Samoan': 0.40742,
        'Kaiwa': 0.0021,
        'Bengali': 243,
        'Swedish': 12.8,
        'Malay': 60.7,
        'Italian': 64.8,
    }
    total = sum(speakers.values())

    # Resolve language ids to names once, instead of issuing one Language
    # query for every Frequency row.
    language_names = {
        language.id: language.name for language in Language.query.all()
    }

    calculation = []
    for phoneme in Phoneme.query.limit(detail).all():
        frequencies = Frequency.query.filter_by(phoneme_id=phoneme.id).all()
        weighted = sum(
            frequency.value
            * speakers[language_names[frequency.language_id]]
            / total
            for frequency in frequencies
        )
        calculation.append((phoneme.name, weighted))
    # Highest weighted frequency first; ties keep database order.
    calculation.sort(key=lambda pair: -pair[1])

    # zip(*[]) yields nothing to unpack, so handle an empty table explicitly.
    if calculation:
        labels, data = zip(*calculation)
    else:
        labels, data = (), ()

    if textOutput:
        return labels

    if scatter:
        plt.yscale("log")
        plt.plot(range(len(data)), data)
    else:
        plt.bar(range(len(data)), data)
    plt.xlabel("Phoneme Rank")
    plt.ylabel("Frequency weighted by Number of Speakers / %")
    plt.title(title)
    plt.show()
def phoible_compare():
    """Return the lines of the ``phoible`` file that are known phoneme names.

    The file ``phoible`` is read from the current working directory, one
    entry per line. Entries are kept in file order, and an entry that occurs
    several times in the file is returned several times.

    Returns:
        A list of the file's lines that equal the ``name`` of some Phoneme
        row in the database.

    Raises:
        OSError: If the ``phoible`` file cannot be opened.
    """
    # The file holds phoneme symbols, so decode it explicitly as UTF-8
    # rather than with the platform's default encoding.
    with open("phoible", "r", encoding="utf-8") as f:
        phoible = f.read().splitlines()
    # A set gives constant-time membership tests for the filter below.
    known_names = {phoneme.name for phoneme in Phoneme.query.all()}
    return [entry for entry in phoible if entry in known_names]
def database(): def database():
final = {'values': []} final = {'values': []}
@ -83,18 +153,19 @@ def phoneme_add(info):
# } # }
phoneme = Phoneme.query.filter_by(name=info['phoneme']).first() phoneme = Phoneme.query.filter_by(name=info['phoneme']).first()
language = Language.query.filter_by(id=info['language_id']).first() language = Language.query.filter_by(id=info['language_id']).first()
if not phoneme:
phoneme = Phoneme(name=info['phoneme'])
db.session.add(phoneme)
link = Frequency.query.filter_by( link = Frequency.query.filter_by(
language_id=language.id, language_id=language.id,
phoneme_id=phoneme.id).first() phoneme_id=phoneme.id).first()
if phoneme and link: if not link:
link.value = info['value']
else:
if not phoneme:
phoneme = Phoneme(name=info['phoneme'])
link = Frequency(value=info['value']) link = Frequency(value=info['value'])
link.phoneme = phoneme link.phoneme = phoneme
language.phonemes.append(link) language.phonemes.append(link)
db.session.add_all([phoneme, link]) db.session.add(link)
else:
link.value = info['value']
def phoneme_remove(info): def phoneme_remove(info):