79 lines
2.4 KiB
Python
79 lines
2.4 KiB
Python
#!/usr/bin/env python2
|
|
"""
|
|
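Feed word clouds with custom colors
===================================
Scrapes the entry summaries of the configured feeds into text files, then
renders them as masked word clouds using the recolor method and a
single-color coloring function.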
|
|
"""
|
|
|
|
import feedparser
|
|
import os
|
|
import json
|
|
import sys
|
|
import string
|
|
|
|
from os import path
|
|
from scipy.misc import imread
|
|
from wordcloud import WordCloud, STOPWORDS, get_single_color_func
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def scrapenwrite(feeds, output_dir):
    """Download each feed and dump its entry summaries as plain text.

    Every element of *feeds* is parsed with ``feedparser``.  The HTML of
    each entry summary is reduced to its stripped text fragments (one per
    line) and written to ``<feed title>.txt`` inside *output_dir*.  The
    same text is also written to a combined ``allposts.txt`` in that
    directory.  Both files are overwritten on every call.

    :param feeds: iterable of feed locations passed to ``feedparser.parse``.
    :param output_dir: directory receiving the text files; created when
        it does not exist yet.
    :raises OSError: if *output_dir* cannot be created and is not already
        a directory.
    """
    # Local import: io.open gives an encoding-aware text file on both
    # Python 2 and Python 3, so unicode text can be written directly.
    import io

    try:
        os.mkdir(output_dir)
    except OSError:
        # An already existing directory is fine; any other failure
        # (permissions, missing parent, ...) must not be hidden.
        if not path.isdir(output_dir):
            raise

    combined_path = path.join(output_dir, 'allposts.txt')
    with io.open(combined_path, "w", encoding='utf8') as g:
        for feed in feeds:
            d = feedparser.parse(feed)
            feed_path = path.join(output_dir, d.feed.title + '.txt').encode('utf8')
            with io.open(feed_path, "w", encoding='utf8') as f:
                for item in d.entries:
                    # Name the parser explicitly so the result does not
                    # depend on which optional parsers are installed.
                    soup = BeautifulSoup(item.summary, 'html.parser')
                    # Trailing newline keeps the last word of this entry
                    # from fusing with the first word of the next one.
                    contents = u"\n".join(soup.stripped_strings) + u"\n"
                    f.write(contents)
                    g.write(contents)
|
|
|
|
|
|
def generate_word_cloud(text, mask_filename, output_image, stop_words,
                        max_words=1000):
    """Render *text* as a grey word cloud shaped by a mask image.

    :param text: corpus handed to ``WordCloud.generate``.
    :param mask_filename: mask image path, resolved relative to the
        directory that contains this script.
    :param output_image: path the rendered image is written to.
    :param stop_words: iterable of extra words to exclude, added to a
        copy of the ``wordcloud`` default ``STOPWORDS``.
    :param max_words: forwarded to ``WordCloud`` as ``max_words``.
    """
    # Directory of this script; mask_filename is resolved against it.
    d = path.dirname(__file__)
    # NOTE(review): scipy.misc.imread is not provided by recent SciPy
    # releases -- confirm the installed SciPy still ships it.
    mask = imread(path.join(d, mask_filename))

    # Work on a copy so the shared STOPWORDS set is left untouched.
    stopwords = STOPWORDS.copy()
    stopwords.update(stop_words)
    # Exclude every single letter as well.  ascii_letters exists on both
    # Python 2 and 3 and, unlike string.letters, ignores the locale.
    stopwords.update(string.ascii_letters)

    # Fixed random states keep the layout and shading reproducible.
    wc = WordCloud(max_words=max_words, mask=mask, stopwords=stopwords,
                   margin=10, random_state=1).generate(text)

    wc.recolor(color_func=get_single_color_func('grey'), random_state=3)

    wc.to_file(output_image)
|
|
|
|
# Script driver: the first command-line argument names a JSON
# configuration file that controls both the scrape and the rendering.
with open(sys.argv[1]) as config_file:
    conf = json.load(config_file)

scrapenwrite(feeds=conf['feeds'], output_dir=conf['output_dir'])

# With 'each_corpi' set, every file in the output directory is a
# candidate; otherwise only the combined corpus is rendered.
files = os.listdir(conf['output_dir']) if conf.get('each_corpi') else ['allposts.txt']

for filename in files:
    # Anything that is not a text corpus is skipped.
    if not filename.endswith('.txt'):
        continue

    # The combined corpus uses the configured image name; every other
    # corpus gets an image named after its own file.
    output_image = conf['output_image'] if filename == 'allposts.txt' else filename + ".png"

    with open(path.join(conf['output_dir'], filename)) as corpus:
        text = corpus.read()

    generate_word_cloud(
        text=text,
        mask_filename=conf['mask_filename'],
        output_image=path.join(conf['output_dir'], output_image),
        stop_words=conf['stop_words'],
        max_words=conf['max_words'],
    )
|