feedcloud/gencloud.py

79 lines
2.4 KiB
Python

#!/usr/bin/env python2
"""
Feed word clouds
================
Scrape the text of the feeds listed in a JSON config file and render it as
masked word clouds, recolored in a single color with the recolor method.
"""
import feedparser
import os
import json
import sys
import string
from os import path
from scipy.misc import imread
from wordcloud import WordCloud, STOPWORDS, get_single_color_func
from bs4 import BeautifulSoup
def _safe_feed_name(title):
    """Turn a feed title into a name usable as a single path component."""
    name = (title or 'untitled').strip() or 'untitled'
    # A path separator inside the title would otherwise point the output
    # file at a different (possibly non-existent) directory.
    for sep in (os.sep, os.altsep):
        if sep:
            name = name.replace(sep, '_')
    return name


def scrapenwrite(feeds, output_dir):
    """Fetch each feed and dump the plain text of its entries to disk.

    For every feed a file named after the feed title (``<title>.txt``) is
    written into ``output_dir``; the text of all feeds is additionally
    collected in ``allposts.txt`` in the same directory. Entry summaries are
    stripped of markup and stored UTF-8 encoded, one entry after another,
    each terminated by a newline so that words of adjacent entries cannot
    run together.

    Args:
        feeds: iterable of feed locations, each handed to
            ``feedparser.parse`` unchanged.
        output_dir: directory receiving the text files; it is created when
            missing (parent directories are not created).

    Raises:
        OSError: if ``output_dir`` cannot be created and does not already
            exist as a directory.
    """
    try:
        os.mkdir(output_dir)
    except OSError:
        # An already existing directory is fine; anything else (permissions,
        # missing parent, a plain file in the way) must not be hidden.
        if not path.isdir(output_dir):
            raise

    # Binary mode: what gets written below is already UTF-8 encoded bytes.
    with open(path.join(output_dir, 'allposts.txt'), "wb") as combined:
        for feed in feeds:
            parsed = feedparser.parse(feed)
            feed_name = _safe_feed_name(parsed.feed.get('title'))
            feed_path = path.join(output_dir, feed_name + '.txt')
            with open(feed_path.encode('utf8'), "wb") as per_feed:
                for item in parsed.entries:
                    # Entries without a summary contribute no text.
                    soup = BeautifulSoup(item.get('summary', ''),
                                         'html.parser')
                    text = "\n".join(soup.stripped_strings)
                    if not text:
                        continue
                    data = (text + "\n").encode('utf8')
                    per_feed.write(data)
                    combined.write(data)
def generate_word_cloud(text, mask_filename, output_image, stop_words,
                        max_words=1000):
    """Render ``text`` as a grey, mask-shaped word cloud image.

    Args:
        text: the corpus handed to ``WordCloud.generate``.
        mask_filename: image file used as the cloud mask, resolved relative
            to the directory containing this script.
        output_image: path the rendered image is written to.
        stop_words: extra words to exclude, in addition to the ``STOPWORDS``
            shipped with wordcloud; ``None`` means no extra words.
        max_words: upper bound on the number of words passed to
            ``WordCloud``.
    """
    # The mask lives next to this script, so anchor it to the script's
    # directory rather than to the current working directory.
    script_dir = path.dirname(__file__)
    mask = imread(path.join(script_dir, mask_filename))

    # Work on a copy so the library-wide STOPWORDS set is left untouched.
    stopwords = set(STOPWORDS)
    stopwords.update(stop_words or ())
    # Exclude single letters as well. ascii_letters exists on Python 2 and 3
    # and, unlike string.letters, does not vary with the locale.
    stopwords.update(string.ascii_letters)

    # Fixed random states keep layout and shading reproducible between runs.
    wc = WordCloud(max_words=max_words, mask=mask, stopwords=stopwords,
                   margin=10, random_state=1).generate(text)
    wc.recolor(color_func=get_single_color_func('grey'), random_state=3)
    wc.to_file(output_image)
# Script entry: the first command-line argument names a JSON config file.
with open(sys.argv[1]) as config_file:
    conf = json.load(config_file)

# Fetch the configured feeds and write their text corpora into output_dir.
scrapenwrite(feeds=conf['feeds'], output_dir=conf['output_dir'])

# With 'each_corpi' set, everything found in output_dir is a candidate;
# otherwise only the combined corpus is rendered.
files = (os.listdir(conf['output_dir']) if conf.get('each_corpi')
         else ['allposts.txt'])

for filename in files:
    # Only text corpora are turned into clouds.
    if not filename.endswith('.txt'):
        continue

    # The combined corpus gets the configured image name; any other corpus
    # is named after its own file.
    output_image = (conf['output_image'] if filename == 'allposts.txt'
                    else filename + ".png")

    with open(path.join(conf['output_dir'], filename)) as corpus:
        text = corpus.read()

    generate_word_cloud(
        text=text,
        mask_filename=conf['mask_filename'],
        output_image=path.join(conf['output_dir'], output_image),
        stop_words=conf['stop_words'],
        max_words=conf['max_words'],
    )