From 7d8c5a9e7501b924e26403d681d2197318437dc6 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jan 27 2022 22:02:28 +0000 Subject: [PATCH 1/49] Wait no -- THIS is the starting point. Don't dig below here. Merge branch 'parameterizing' into default --- diff --git a/NOTES.md b/NOTES.md index 2769b7a..bd42553 100644 --- a/NOTES.md +++ b/NOTES.md @@ -14,6 +14,3 @@ silly ones. We do still keep the architecture match, though. We could do some other interesting reports on the outlier data — queries that come from different OSes, or across architectures. These are usually cross-compile builds or container cases. - -The database uses "passel" instead of "group" because SQL is silly. I try -not to expose that to... anything visible outside the code. \ No newline at end of file diff --git a/TODO.md b/TODO.md index 45caae0..2f78dc7 100644 --- a/TODO.md +++ b/TODO.md @@ -1,16 +1,3 @@ -* OK FINE USE "dataset" instead of "passel" - -* First thing: change the format so "group" is a thing instead of separate - tables. - -* GALAXY BRAIN: render charts from the nc files rather than having - a csv intermediary. then we won't have to split out ephemeral/persistent - (Earlier merely big brain: I was thinking that different data sets like - epel and fedora-updates would have very different reports. But I - decided to make them the same after all. So it's just an extra step.) - -* still write out some CSV files because they're handy! - * text reports!!! * this week / last week / year-over-year * total systems / total persistent / total ephemeral (+%) @@ -25,6 +12,9 @@ * architecture as above * share of category (desktop/server-cloud-iot/labs) +* use jinjasql for the query templates! +* sanitize everything coming from config.toml, really. 
+ * for the slicer, put the groups in their definitions in the config.toml * be smarter about which timeseries to make @@ -76,6 +66,10 @@ * desktop,server +* I guess we should make it so the timeseries definitions can loop over multiple datasets to avoid + a lot of redundancy. Or at least, to apply to all Fedora datasets? (Yes, that: introduce a "distro" grouping.) + - need a way to actually include multiple datasets at once though, like for the fedora linux + epel graph + * predefined colors for some things * fix it so colors don't overlap when there's more than 12 options. @@ -90,6 +84,8 @@ * something is messed up with the old waffle chart code. throw away, start again +* sanatize all values read from config.toml + * useful waffle charts (show current week, maybe average last 2-4): * full [arch,variant,release] (with different shape for ephemeral!) * Breakouts (multiple charts per file?) diff --git a/brontosaurus-egg-sorter.py b/brontosaurus-egg-sorter.py index d26b98e..e8dff2e 100755 --- a/brontosaurus-egg-sorter.py +++ b/brontosaurus-egg-sorter.py @@ -104,7 +104,7 @@ onecounter = Counter() loopcursor.execute( - "SELECT DISTINCT(passel) FROM checkins ORDER BY passel DESC") + "SELECT DISTINCT(dataset) FROM checkins ORDER BY dataset DESC") groups = [item for sublist in loopcursor.fetchall() for item in sublist] @@ -113,7 +113,7 @@ for group in groups: onecounter.clear() loopcursor.execute( - 'SELECT * FROM checkins WHERE passel = :passel AND age = 1 ORDER BY week', {'passel': group}) + 'SELECT * FROM checkins WHERE dataset = :dataset AND age = 1 ORDER BY week', {'dataset': group}) for row in loopcursor: (week, group, release, variant, arch, age, hits) = row @@ -122,7 +122,7 @@ for group in groups: # get the other age groups for this type of system, if any query = """SELECT age,hits FROM checkins WHERE week = :week AND - passel = :passel AND + dataset = :dataset AND release = :release AND variant = :variant AND arch = :arch AND @@ -131,7 +131,7 @@ for group in 
groups: """ nextcursor.execute(query, {"week": week, - "passel": group, + "dataset": group, "release": release, "variant": variant, "arch": arch @@ -158,7 +158,7 @@ for group in groups: if group.split('_', 1)[0] == 'fedora': query = """SELECT age,sum(hits) FROM checkins WHERE week = :nextweek AND - passel = :passel AND + dataset = :dataset AND release >= :release AND variant = :variant AND arch = :arch AND @@ -168,7 +168,7 @@ for group in groups: else: query = """SELECT age,sum(hits) FROM checkins WHERE week = :nextweek AND - passel = :passel AND + dataset = :dataset AND release = :release AND variant = :variant AND arch = :arch AND @@ -177,7 +177,7 @@ for group in groups: ORDER BY age""" nextcursor.execute(query, {"nextweek": (datetime.datetime.fromisoformat(week) + datetime.timedelta(weeks=1)).strftime("%Y-%m-%d"), # this was easier when it was weeknums! - "passel": group, + "dataset": group, "release": release, "variant": variant, "arch": arch @@ -248,10 +248,10 @@ for group in groups: new_zero, new_one, thisone) nextcursor.execute("""INSERT INTO checkins - (week, passel, release, variant, arch, age, hits) - VALUES (:week, :passel, :release, :variant, :arch, :age, :hits)""", + (week, dataset, release, variant, arch, age, hits) + VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", {"week": week, - "passel": group, + "dataset": group, "release": release, "variant": variant, "arch": arch, @@ -259,10 +259,10 @@ for group in groups: "hits": new_zero }) nextcursor.execute("""REPLACE INTO checkins - (week, passel, release, variant, arch, age, hits) - VALUES (:week, :passel, :release, :variant, :arch, :age, :hits)""", + (week, dataset, release, variant, arch, age, hits) + VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", {"week": week, - "passel": group, + "dataset": group, "release": release, "variant": variant, "arch": arch, diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index 42b07e8..c3aa3a8 100755 --- 
a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -2,9 +2,9 @@ import matplotlib.dates as dates import matplotlib.pyplot as plt + import sqlite3 -import os -import re +from string import Template from collections import defaultdict from collections import OrderedDict @@ -25,15 +25,9 @@ m.rcParams['font.size'] = 12 m.rcParams['font.family'] = 'Montserrat' m.rcParams['legend.frameon'] = False -# fix me: define pretty labels in view_defaults -AGE_LABELS = {'0': 'Ephemeral', - '1': 'First week', - '2': '2-4 weeks', - '3': '5-24 weeks', - '4': '25+ weeks'} - def get_colors(colormappings, colorlist, dataset, dataseries, items): + """This makes colors 'sticky' for the whole run.""" key = dataset + '_' + dataseries @@ -49,260 +43,141 @@ def get_colors(colormappings, colorlist, dataset, dataseries, items): return outcolors -def graph_timeseries(view, dataframe, colormappings, dataset, dataseries): +def graph_timeseries(config, colormappings, params, dataframe): + """Draws line or area chart for a dataseries over time.""" - # If we find we have missing data, in the future + # If we find we have missing data, in the future: # dataframe.resample('W-MON') + dataset = params['dataset'] + dataseries = params['dataseries'] + ################# - # cull the weak - # fixme: accumlate these into "other" + # Instead of this, accumulate anything more than 10 into "other" + # ... and do it _elsewhere_ (easier to do before pivot anyway!) # + limit number of columns to 10 + other hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < view['hidepercent']/100 + axis=1), axis=0).max() < 0.2/100 dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) ################## # our colors. # the complication here is keeping the same color for the same label # across multiple graphs! 
- cmap = m.colors.ListedColormap(get_colors(colormappings, view['colors'], + cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], dataset, dataseries, list(dataframe.columns))) - cmap_r = m.colors.ListedColormap(cmap.colors[::-1]) ################## # and now.... graph it! # FIXME: this is ugly - startdate = view['startdate'][dataset.split('_', 1)[0]] - - # lines - graph = dataframe[startdate:].plot(figsize=view['figsize'], colormap=cmap) - - ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() - if dataseries == 'age': - labels = list(map(AGE_LABELS.get, labels)) - plt.legend(handles, labels, loc='center left', bbox_to_anchor=(1.0, 0.5)) - - plt.suptitle(dataset + ": " + - dataseries + " over time", fontsize=24) - if view['ephemeral'] != 'all': - graph.set_title(view['ephemeral'] + " systems", fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - graph.figure.savefig('images/svg/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '.svg', dpi=view['dpi'], bbox_inches="tight") - graph.figure.savefig('images/png/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '.png', dpi=view['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - - # stacked - - reversed = dataframe[dataframe.columns[::-1]] - graph = reversed[startdate:].plot( - figsize=view['figsize'], colormap=cmap_r, kind='area') - - ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() - if dataseries == 'age': - labels = list(map(AGE_LABELS.get, labels)) - plt.legend(handles[::-1], labels[::-1], - 
loc='center left', bbox_to_anchor=(1.0, 0.5)) - - plt.suptitle(dataset + ": " + - dataseries + " over time (stacked)", fontsize=24) - if view['ephemeral'] != 'all': - graph.set_title(view['ephemeral'] + " systems", fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - graph.figure.savefig('images/svg/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '-stacked.svg', dpi=view['dpi'], bbox_inches="tight") - graph.figure.savefig('images/png/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '-stacked.png', dpi=view['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - - # area (share / percent) - - percentframe = dataframe.div(dataframe.sum(axis=1), axis=0)*100 - - graph = percentframe[startdate:].plot( - figsize=view['figsize'], colormap=cmap, kind='area') - - ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() - if dataseries == 'age': - labels = list(map(AGE_LABELS.get, labels)) - plt.legend(handles, labels, loc='center left', bbox_to_anchor=(1.0, 0.5)) - - plt.suptitle(dataset + ": " + - dataseries + " over time (share)", fontsize=24) - if view['ephemeral'] != 'all': - graph.set_title(view['ephemeral'] + " systems", fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - graph.figure.savefig('images/svg/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '-share.svg', dpi=view['dpi'], bbox_inches="tight") - 
graph.figure.savefig('images/png/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '-share.png', dpi=view['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - - -""" -def graph_average(view, colormappings): - - if view['ephemeral'] == 'all': - csvfile = 'csv/' + dataset + '-average-' + \ - view['columns'] + '-by-' + view['rows'] + '.csv' - elif view['ephemeral'] == 'ephemeral': - csvfile = 'csv/' + dataset + '-average-' + \ - view['columns'] + '-by-' + view['rows'] + '-ephemeral.csv' - elif view['ephemeral'] == 'persistent': - csvfile = 'csv/' + dataset + '-average-' + \ - view['columns'] + '-by-' + view['rows'] + '-persistent.csv' - else: - print("Invalid value for 'ephemeral'") - print(csvfile) - - dataframe = pd.read_csv(csvfile, header='>', index_col=0) - - for index, row in dataframe.iterrows(): - - # values which are less than one box are gonna get combined - otherlist = row.div(row.sum()) * \ - (view['waffle']['rows']*view['waffle']['rows']) < 1 - otherval = otherval = row[otherlist[otherlist == True].keys()].sum() - prunedrow = row.loc[otherlist[otherlist == False].keys()] - # if the sum of the discards is big enough to make a block, add an "other" row - if (otherval / (prunedrow.sum() + otherval)) * (view['waffle']['rows']*view['waffle']['rows']) >= 1: - # print("Debug: adding 'other' entry.") - prunedrow['other'] = otherval - - # this makes sure we sort these things by number, rather than by weight - # fixme: don't special case here; instead, put sort options in the view definition - if view['rows'] == 'age' or view['rows'] == 'release': - data = dict(prunedrow.sort_index(ascending=True)) - else: - data = dict(prunedrow.sort_values(ascending=True)) - - # keep colors consistent per label - colors = get_colors(colormappings, view['colors'], - dataset, view['rows'], data.keys()) - - # make the age labels human-readable. - # is this the right place to do this? probably not. 
- if view['rows'] == 'age': - data = dict(zip(AGE_LABELS.values(), data.values())) - - if view['ephemeral'] != 'all': - label = f"{dataset}: {view['rows']} for {view['columns']} {index} ({view['ephemeral']})" - else: - label = f"{dataset}: {view['rows']} for {view['columns']} {index}" - - total = sum(list(data.values())) - fig = plt.figure( - FigureClass=Waffle, - rows=view['waffle']['rows'], - columns=view['waffle']['columns'], - values=data, - figsize=view['figsize'], - rounding_rule='ceil', - colors=colors, - title={'label': label, - 'loc': 'center', - 'pad': 48, - 'fontdict': {'fontsize': 24} - }, - legend={ - 'labels': [f"{k} ({v*100/total:.1f}%)" for k, v in data.items()], - 'loc': 'lower center', - 'ncol': 5, - 'framealpha': 0, - 'bbox_to_anchor': (0.5, -0.2), - 'fontsize': 14 - } - - ) + startdate = config['startdate'][dataset.split('_', 1)[0]] + + for view in params['views']: + + match view: + case 'line': + df = dataframe[startdate:] + kind = 'line' + colormap = cmap + case 'stacked': + df = dataframe[startdate:][dataframe.columns[::-1]] + kind = 'area' + colormap = m.colors.ListedColormap(cmap.colors[::-1]) + case 'share': + df = dataframe[startdate:].div( + dataframe.sum(axis=1), axis=0)*100 + kind = 'area' + colormap = cmap + + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind) + + # Labels and titles and stuff. 
+ ax = plt.gca() + + handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) + + if view == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] + + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) + + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': view, + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][view]} + + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) + + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) + + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') + + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") + + plt.close(graph.figure) + print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") - safeindex = re.sub('[\W_-]', '', str(index)) - basename = dataset + '-waffle-' + view['rows'] + \ - '-for-' + view['columns'] + '-' + \ - safeindex + '-' + view['ephemeral'] - - fig.savefig('images/svg/' + - dataset + '/' + basename + '.svg', dpi=view['dpi']) - fig.savefig('images/png/' + - dataset + '/' + basename + '.png', dpi=view['dpi']) - - plt.close() -""" ########################################### def main(): - defaults = 
toml.load("view-defaults.toml") + config = toml.load("config.toml") colormappings = defaultdict(OrderedDict) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) cursor = database.cursor() - cursor.execute( - "SELECT DISTINCT(passel) FROM checkins ORDER BY passel DESC") - groups = [item for sublist in cursor.fetchall() for item in sublist] - - for group in groups: - for column in ["release", "variant", "age", "arch"]: - # f-string normally dangerous but here we are using - # the hard-coded values above, and "group" is also - # something we control. - df = pd.read_sql_query(f"""SELECT - week, - {column}, - SUM(hits) as hits - FROM checkins - WHERE passel=\"{group}\" - GROUP BY week,{column} - ORDER BY week""", - parse_dates='week', - con=database) - - graph_timeseries( - defaults, - colormappings=colormappings, - dataframe=df.pivot(index='week', columns=column, - values='hits').astype("Int64"), - dataset=group, - dataseries=column - ) + + for timeseries in config['timeseries']: + params = config['timeseries_defaults'].copy() + params.update(timeseries) + + query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits + FROM checkins + WHERE dataset=\"{params['dataset']}\" + {params['extraselect']} + GROUP BY week,{params['dataseries']} + ORDER BY week + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + graph_timeseries( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], + values='hits').astype("Int64"), + ) if __name__ == "__main__": diff --git a/brontosaurus-slicer.sh b/brontosaurus-slicer.sh index 258f663..8235990 100755 --- a/brontosaurus-slicer.sh +++ b/brontosaurus-slicer.sh @@ -5,7 +5,7 @@ # # It splits the records into major groups: EPEL, and then also # "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. -# Because "group" is a reserved word in sql, we use "passel". 
+# Because "group" is a reserved word in sql, we use "dataset". # # It removes the os_ prefix, because without repo_ columns there # is no ambiguity to resolve. @@ -27,18 +27,18 @@ DROP TABLE IF EXISTS bronto.checkins; CREATE TABLE bronto.checkins( week INT, - passel TEXT, + dataset TEXT, release TEXT, variant TEXT, arch TEXT, age INT CHECK(age<5), hits INT, - UNIQUE (week,passel,release,variant,arch,age) + UNIQUE (week,dataset,release,variant,arch,age) ); INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_systems" AS passel, + "fedora_updates_systems" AS dataset, os_version AS release, os_variant AS variant, os_arch AS arch, @@ -56,7 +56,7 @@ INSERT INTO bronto.checkins INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_containers" AS passel, + "fedora_updates_containers" AS dataset, os_version AS release, os_variant AS variant, os_arch AS arch, @@ -75,7 +75,7 @@ INSERT INTO bronto.checkins INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_systems" AS passel, + "fedora_rawhide_systems" AS dataset, os_version AS release, os_variant AS variant, os_arch AS arch, @@ -92,7 +92,7 @@ INSERT INTO bronto.checkins INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_containers" AS passel, + "fedora_rawhide_containers" AS dataset, os_version AS release, os_variant AS variant, os_arch AS arch, @@ -110,7 +110,7 @@ INSERT INTO bronto.checkins INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "epel" AS passel, + "epel" AS dataset, CASE instr(os_version,".") WHEN 0 THEN os_version ELSE substr(os_version,0,instr(os_version,".")) diff --git a/brontosaurus-washer.sh b/brontosaurus-washer.sh index e6a1b5f..8dad9ef 100755 --- a/brontosaurus-washer.sh +++ b/brontosaurus-washer.sh @@ -40,10 +40,10 @@ FEDORA_STARTDAY='2020-04-27' 
EPEL_STARTVER=8 EPEL_STARTDAY='2021-01-01' sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE passel GLOB "fedora*" AND release < $FEDORA_STARTVER; - DELETE FROM checkins WHERE passel GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; - DELETE FROM checkins WHERE passel GLOB "epel*" AND release < $EPEL_STARTVER; - DELETE FROM checkins WHERE passel GLOB "epel*" AND week < "$EPEL_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; EOF # Clean up entries for name, arch, or release that show up @@ -59,13 +59,13 @@ EOF THRESHOLD_TOTAL=100 THRESHOLD_WEEKLY=3 -for GROUP in $(echo 'SELECT DISTINCT(passel) FROM checkins;' | sqlite3 ./db/bronto.db); do +for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE passel = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE passel = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE passel = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE passel = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE passel = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE passel = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE passel = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE passel = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE passel = "$GROUP" AND release IN (SELECT release FROM checkins WHERE passel = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE passel = "$GROUP" AND release IN (SELECT release FROM checkins WHERE 
passel = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); EOF done diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..a6c5c58 --- /dev/null +++ b/config.toml @@ -0,0 +1,437 @@ +ephemeral = "all" + +figsize = [16, 9] +dpi = 300 + + +# Our palette. Note that this also limits the +# number of items per chart. If there are +# more than the number of colors, the last +# color here becomes "other". +# (TODO! Implement that!) +colors = [ + '#51a2da', + '#294172', + '#afea85', + '#db3279', + '#f5a326', + '#b193c8', + '#38bc3b', + '#3c6eb4', + '#eb7434', + '#603e79', + '#ffd117', + '#aad0ee', + '#101010', + '#535961', +] + +# could be png, pdf, svg +# TODO: not yet implemented +image_types = ["png"] + +# todo: change the fedora start date to also 2021-01-01, after DevConf.cz 2022. 
+[startdate] +fedora = '2020-04-27' # week of Fedora 32 release +epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 + +[dataset_labels] +epel = "Extra Packages for Enterprise Linux" +fedora_updates_systems = "Fedora Linux systems" +fedora_updates_containers = "Fedora Linux containers" +fedora_rawhide_systems = "Fedora Rawhide systems" +fedora_rawhide_containers = "Fedora Rawhide containers" + +[dataseries_labels] +arch="CPU architecture" +release="release " +variant="variant" +age="age category" + +[age_labels] +'0'='Ephemeral' +'1'='First week' +'2'='2-4 weeks' +'3'='5-24 weeks' +'4'='25+ weeks' + +[view_labels] +'line'="" +'stacked'=" (stacked)" +'share'=" (share)" + +[timeseries_defaults] +title="$dataset_label: weekly checkins by $dataseries_label$view_label" +filebase="$dataset-timeseries-$dataseries-$view" +extraselect="" +# not all of these are implemented. But we could have... +#subtitle= +#dataset= +#dataseries= +#orderbyhits= +#reverse= +# todo: back to the idea of reading these +# from individual, merged configuration files! 
+ +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="release" 
+views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] 
+filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="age" +views=['share','stacked'] + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" 
+dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="release" +views=['line','stacked','share'] 
+extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="epel" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="epel" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="epel" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + diff --git a/run.sh b/run.sh index d19d5d5..71512cf 100755 --- a/run.sh +++ b/run.sh @@ -71,15 +71,13 @@ echo -n "* Sorting the eggs... " echo " binaried." echo "* Creating cages for different exhibits..." - for dataset in $(echo 'SELECT DISTINCT(passel) FROM checkins;' | sqlite3 ./db/bronto.db); do + for dataset in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do mkdir -p images/{svg,png}/$dataset done echo " Built!" echo "* Drawing portraits from the fossilized remains... 
" - #LINES=$(ls csv/*.csv |wc -l) - # FIXME - LINES=30 + LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) ./brontosaurus-plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null if [[ $? != 0 ]]; then echo "! Oops." diff --git a/view-defaults.toml b/view-defaults.toml deleted file mode 100644 index bfe46d8..0000000 --- a/view-defaults.toml +++ /dev/null @@ -1,51 +0,0 @@ -ephemeral = "all" - -figsize = [16, 9] -dpi = 300 - -# todo: change the fedora start date to also 2021-01-01, after DevConf.cz 2022. - -startdate.fedora = '2020-04-27' # week of Fedora 32 release -startdate.epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 - -waffle.columns = 50 -waffle.rows = 20 - -# hide below this percent -hidepercent = 0.2 - -# our palette -colors = [ - '#51a2da', - '#294172', - '#afea85', - '#db3279', - '#f5a326', - '#b193c8', - '#38bc3b', - '#3c6eb4', - '#eb7434', - '#603e79', - '#ffd117', - '#aad0ee', - '#101010', - '#535961', -] - -#timeseries.types.release = ["line", "stacked", "share"] -#timeseries.types.variant = ["stacked", "share"] -#timeseries.types.arch = ["line", "share"] -#timeseries.types.age = ["line", "share"] - -#timeseries.ephemeral.release = "combined" -#timeseries.ephemeral.variant = "separate" -#timeseries.ephemeral.arch = "combined" -#timeseries.ephemeral.age = "none" - -#epel.group.classic = "CentOS Linux" -#epel.variant_variants = [ {"Without CentOS Linux": "-classic" }] - -# TODO: Figure out an expressive way to do this. -#fedora.group.container = ["container","toolbox","snappy"] -#fedora.group.server = ["cloud","coreos","iot","server"] -#fedora.group.ostree = ["coreos","iot","kinoite","silverblue"] From 5115258961f3b622be8fff6993835a41443e4d19 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jan 27 2022 23:15:39 +0000 Subject: [PATCH 2/49] wip. find the peak weeks. 
--- diff --git a/TODO.md b/TODO.md index 2f78dc7..a258cef 100644 --- a/TODO.md +++ b/TODO.md @@ -17,43 +17,7 @@ * for the slicer, put the groups in their definitions in the config.toml -* be smarter about which timeseries to make - - * age already includes 0 and 1-4, so having separate ephemeral/persistent - views isn't useful - - * age unstacked line chart isn't really useful -- skip. - - * arch stacked chart isn't super useful either -- share is best, line is - ok. - - few enough lines that we can probably put ephemeral and persistent - on same chart. - - * variant stacked also isn't useful -- share is best, line ok - - but ephemeral vs persistent is a nightmare! - - * for release, all three are good (but maybe present ephemeral and - persistent on same line chart?) - - * secondary timeline charts for variants: - - * epel variants without centos linux (or rhel?) - - * fedora variants with just: - * desktop variants - * server/cloud/iot variants - * labs (compneuro, design suite) - * the three above, grouped - * ostree vs non-ostree (summed!) - - * the grouped one for arch - - * So, that's: - - * age over time — share and stacked (no special handling for ephemeral) - * arch over time — share and line (ephemeral on same chart?) - * variant over time — share and line (ephemeral separate charts) - * release — line, share, stacked (ephemeral on same charts?) +* secondary timeline charts for variants: * variant variants! * epel without CentOS Linux @@ -81,9 +45,6 @@ point, not summed (because that's its most interesting!) * don't bother with ephemeral/persistent view (age view is enough) -* something is messed up with the old waffle chart code. throw away, start - again - * sanatize all values read from config.toml * useful waffle charts (show current week, maybe average last 2-4): @@ -97,11 +58,6 @@ * make animations by week of full [arch,variant,release] * maybe of the breakouts too? 
-* Instead of a hard-coded thing in the plotter, generalize the - table and column-name-to-human-term code. Could also be used for formatting - "Mate-Compiz" and the like. - - * change the timeseries "hide" to collect small things into "other" @@ -124,41 +80,17 @@ old systems dropping out and being replaced by new ones. (In the latter case, we have _fewer_ ephemerial systems than we are currently guessing.) -* clean up the in-triplicate writing for ephemeral, permanent, and all - -* add totals for the waffle charts - * skip waffle charts that will never be interesting -* figure out how to estimate chart time better - * once we have more than a year of data, start Fedora chart at 2021-01-01, same as epel, because that initial growth curve is not really representative of anything but upgrades and all the initial data therefore skewed -* add numeric labels to the waffle charts! ("1 square = nnn systems") - -* Add Rawhide as a separate table. Needs special handling because it's hard - to sort out development on a regular Fedora OS release vs actually running - Rawhide. - -* something to make colors consistent - -* Filtering out obviously ridiculous data should be done before - the "dicer" stage, because otherwise it balloons the dataset. * Change ./run.sh into a makefile, because old-school. - -* Rework it so temporary files go in tmpdirs and data goes in var or - something (configurable) - - -* Related todo: with the by-release graphs, stop after the release is - no longer current. - -* Bonus: separate graphs for "which variants tend to persist after EOL" +* Can we get anything interesting for "which variants tend to persist after EOL"? * import estimates from old data @@ -177,6 +109,7 @@ * fix the code in brotosaurus washer to merge '' to 'none' rather than just renaming (works now because there are no natural 'none' entries). 
+ * map "unknown" to "generic" * instead of throwing away entries in the washing phase (especially those diff --git a/brontosaurus-fight.sh b/brontosaurus-fight.sh new file mode 100755 index 0000000..ba14da8 --- /dev/null +++ b/brontosaurus-fight.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# +# Create a view which only shows the weeks where each release +# is at its peak. If someone actually is Good At SQL, I would +# not mind help making this more clear. + +sqlite3 db/bronto.db << EOF + DROP VIEW IF EXISTS peak; + CREATE VIEW peak AS + SELECT checkins.week, + checkins.dataset, + checkins.release, + checkins.variant, + checkins.arch, + checkins.age, + checkins.hits + FROM checkins + INNER JOIN + (SELECT week,dataset,release,max(hits) + FROM (SELECT week,dataset,release,sum(hits) AS hits + FROM checkins + GROUP BY week,dataset,release + ORDER BY week) + GROUP BY dataset,release) AS peaks + ON peaks.week = checkins.week AND peaks.release = checkins.release; +EOF diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index c3aa3a8..5eb4401 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -156,17 +156,17 @@ def main(): colormappings = defaultdict(OrderedDict) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - cursor = database.cursor() - + # cursor = database.cursor() + ''' for timeseries in config['timeseries']: params = config['timeseries_defaults'].copy() params.update(timeseries) query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits FROM checkins - WHERE dataset=\"{params['dataset']}\" + WHERE dataset =\"{params['dataset']}\" {params['extraselect']} - GROUP BY week,{params['dataseries']} + GROUP BY week, {params['dataseries']} ORDER BY week """ df = pd.read_sql_query(query, parse_dates='week', con=database) @@ -178,6 +178,24 @@ def main(): dataframe=df.pivot(index='week', columns=params['dataseries'], values='hits').astype("Int64"), ) + ''' + + # sorry about this. 
+ # what it does is: find all the rows from the peak + # week for each release. + query = """SELECT checkins.week,checkins.dataset,checkins.release,checkins.variant,checkins.arch,checkins.age,checkins.hits from checkins INNER JOIN + (SELECT week,dataset,release,max(hits) + FROM (SELECT week,dataset,release,sum(hits) AS hits + FROM checkins + GROUP BY week,dataset,release + ORDER BY week) + GROUP BY dataset,release) AS peaks + ON peaks.week = checkins.week AND peaks.release = checkins.release + """ + + at_peak = pd.read_sql_query(query, parse_dates='week', con=database) + pd.set_option('display.max_rows', len(at_peak)) + print(at_peak) if __name__ == "__main__": diff --git a/run.sh b/run.sh index 71512cf..41cfa7d 100755 --- a/run.sh +++ b/run.sh @@ -62,6 +62,14 @@ echo -n "* Scrubbing off the dirt... " fi echo " shiny!" +echo -n "* Finding the strongest... " + ./brontosaurus-fight.sh + if [[ $? != 0 ]]; then + echo "! Oops." + exit 1 + fi +echo " rarrhhhhr!" + echo -n "* Sorting the eggs... " ./brontosaurus-egg-sorter.py if [[ $? != 0 ]]; then From c3865b17a3d6c30e145b0da5fc7ee1dfd5d515dd Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jan 28 2022 00:10:07 +0000 Subject: [PATCH 3/49] bar charts in progras --- diff --git a/TODO.md b/TODO.md index a258cef..018c7f8 100644 --- a/TODO.md +++ b/TODO.md @@ -17,6 +17,8 @@ * for the slicer, put the groups in their definitions in the config.toml +* better ordering + * secondary timeline charts for variants: * variant variants! @@ -53,7 +55,7 @@ * age [arch,variant,release] (sort age 4-0 instead of 0-4) * variant for arch (different shape for ephemeral) -* EPEL charts with names in labels need fewer columns! + * EPEL charts with names in labels need fewer columns! * make animations by week of full [arch,variant,release] * maybe of the breakouts too? @@ -109,7 +111,7 @@ * fix the code in brotosaurus washer to merge '' to 'none' rather than just renaming (works now because there are no natural 'none' entries). 
- + * map "unknown" to "generic" * instead of throwing away entries in the washing phase (especially those diff --git a/brontosaurus-fight.sh b/brontosaurus-fight.sh index ba14da8..5726335 100755 --- a/brontosaurus-fight.sh +++ b/brontosaurus-fight.sh @@ -22,5 +22,7 @@ sqlite3 db/bronto.db << EOF GROUP BY week,dataset,release ORDER BY week) GROUP BY dataset,release) AS peaks - ON peaks.week = checkins.week AND peaks.release = checkins.release; + ON peaks.week = checkins.week + AND peaks.dataset = checkins.dataset + AND peaks.release = checkins.release; EOF diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index 5eb4401..5248b6c 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -147,6 +147,106 @@ def graph_timeseries(config, colormappings, params, dataframe): print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") +def graph_releasebars(config, colormappings, params, dataframe): + """Draws earch release in the set as a bar chart""" + + # If we find we have missing data, in the future: + # dataframe.resample('W-MON') + + dataset = params['dataset'] + dataseries = params['dataseries'] + + ################# + # Instead of this, accumulate anything more than 10 into "other" + # ... and do it _elsewhere_ (easier to do before pivot anyway!) + # + limit number of columns to 10 + other + + hidelist = dataframe.div(dataframe.sum( + axis=1), axis=0).max() < 0.2/100 + dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) + + ################## + # our colors. + # the complication here is keeping the same color for the same label + # across multiple graphs! + cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], + dataset, dataseries, list(dataframe.columns))) + + ################## + # and now.... graph it! 
+ + # FIXME: this is ugly + startdate = config['startdate'][dataset.split('_', 1)[0]] + + for view in params['views']: + + match view: + case 'stacked': + df = dataframe[startdate:] + kind = 'bar' + colormap = cmap + case 'share': + df = dataframe[startdate:].div( + dataframe.sum(axis=1), axis=0)*100 + kind = 'bar' + colormap = cmap + + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind) + + # Labels and titles and stuff. + ax = plt.gca() + + handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) + + if view == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] + + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) + + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': view, + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][view]} + + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) + + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) + + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') + + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") + + plt.close(graph.figure) + 
print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") + + ########################################### @@ -156,7 +256,7 @@ def main(): colormappings = defaultdict(OrderedDict) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - # cursor = database.cursor() + ''' for timeseries in config['timeseries']: params = config['timeseries_defaults'].copy() @@ -180,23 +280,34 @@ def main(): ) ''' - # sorry about this. - # what it does is: find all the rows from the peak - # week for each release. - query = """SELECT checkins.week,checkins.dataset,checkins.release,checkins.variant,checkins.arch,checkins.age,checkins.hits from checkins INNER JOIN - (SELECT week,dataset,release,max(hits) - FROM (SELECT week,dataset,release,sum(hits) AS hits - FROM checkins - GROUP BY week,dataset,release - ORDER BY week) - GROUP BY dataset,release) AS peaks - ON peaks.week = checkins.week AND peaks.release = checkins.release - """ - - at_peak = pd.read_sql_query(query, parse_dates='week', con=database) - pd.set_option('display.max_rows', len(at_peak)) - print(at_peak) + for byrelease in config['byrelease']: + params = config['byrelease_defaults'].copy() + params.update(byrelease) + + query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits + FROM peak + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY release, {params['dataseries']} + ORDER BY release + """ + df = pd.read_sql_query(query, con=database) + + graph_releasebars( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='release', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + +''' +### getting ahead of myself: this is for the waffle charts +query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" +df = pd.read_sql_query(query, parse_dates='week', con=database) +df +''' if __name__ == "__main__": main() diff --git a/config.toml 
b/config.toml index a6c5c58..4bcf55e 100644 --- a/config.toml +++ b/config.toml @@ -87,351 +87,420 @@ views=['line','stacked','share'] extraselect="AND age=0" filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_systems" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_systems" +# dataseries="age" +# views=['share','stacked'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# dataset="fedora_updates_systems" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_systems" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_systems" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_systems" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_systems" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_systems" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND 
age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_containers" +# dataseries="release" +# views=['line','stacked','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_containers" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_containers" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_containers" +# dataseries="age" +# views=['share','stacked'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# dataset="fedora_updates_containers" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_containers" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_containers" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_containers" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_containers" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_containers" +# 
dataseries="variant" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +# [[timeseries]] +# dataset="fedora_rawhide_systems" +# dataseries="release" +# views=['line','stacked','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_systems" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_systems" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_rawhide_systems" +# dataseries="age" +# views=['share','stacked'] + +# [[timeseries]] +# dataset="fedora_rawhide_systems" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_systems" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_systems" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_rawhide_systems" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_systems" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_systems" +# 
dataseries="variant" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +# [[timeseries]] +# dataset="fedora_rawhide_containers" +# dataseries="release" +# views=['line','stacked','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_containers" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_containers" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_rawhide_containers" +# dataseries="age" +# views=['share','stacked'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# dataset="fedora_rawhide_containers" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_containers" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_containers" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_rawhide_containers" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_containers" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# 
subtitle="persistent systems" +# dataset="fedora_rawhide_containers" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +# [[timeseries]] +# dataset="epel" +# dataseries="release" +# views=['line','stacked','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="epel" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="epel" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="epel" +# dataseries="age" +# views=['share','stacked'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# dataset="epel" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="epel" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="epel" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="epel" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="epel" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="epel" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND 
age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[byrelease_defaults] +title="$dataset_label: $dataseries_label by release" +filebase="$dataset-byrelease-$dataseries-$view" +extraselect="" +views=['stacked','share'] -[[timeseries]] +[[byrelease]] dataset="fedora_updates_systems" dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_updates_systems" dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" -[[timeseries]] +[[byrelease]] dataset="fedora_updates_systems" dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] 
-subtitle="persistent systems" -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] +[[byrelease]] dataset="fedora_updates_containers" dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_updates_containers" dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_updates_containers" dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" 
-dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] +[[byrelease]] dataset="fedora_rawhide_systems" dataseries="age" -views=['share','stacked'] - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_rawhide_systems" dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_rawhide_systems" dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] 
-extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] +[[byrelease]] dataset="fedora_rawhide_containers" dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] +[[byrelease]] dataset="fedora_rawhide_containers" dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_rawhide_containers" dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -[[timeseries]] -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - 
-[[timeseries]] +[[byrelease]] dataset="epel" dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] -dataset="epel" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" +[[byrelease]] dataset="epel" dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] +[[byrelease]] dataset="epel" dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - From c669134faa31a88244db6891c98edf409cb7c387 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Feb 04 2022 13:44:33 +0000 Subject: [PATCH 4/49] reenable timeseries views. still wip here... 
--- diff --git a/config.toml b/config.toml index 4bcf55e..5bdcb80 100644 --- a/config.toml +++ b/config.toml @@ -87,353 +87,353 @@ views=['line','stacked','share'] extraselect="AND age=0" filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_systems" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_systems" -# dataseries="age" -# views=['share','stacked'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# dataset="fedora_updates_systems" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_systems" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_systems" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_systems" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_systems" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_systems" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_containers" -# dataseries="release" -# views=['line','stacked','share'] -# 
filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_containers" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_containers" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_containers" -# dataseries="age" -# views=['share','stacked'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# dataset="fedora_updates_containers" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_containers" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_containers" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_containers" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_containers" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_containers" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -# [[timeseries]] -# 
dataset="fedora_rawhide_systems" -# dataseries="release" -# views=['line','stacked','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_systems" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_systems" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_rawhide_systems" -# dataseries="age" -# views=['share','stacked'] - -# [[timeseries]] -# dataset="fedora_rawhide_systems" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_systems" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_systems" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_rawhide_systems" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_systems" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_systems" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -# [[timeseries]] -# 
dataset="fedora_rawhide_containers" -# dataseries="release" -# views=['line','stacked','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_containers" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_containers" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_rawhide_containers" -# dataseries="age" -# views=['share','stacked'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# dataset="fedora_rawhide_containers" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_containers" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_containers" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_rawhide_containers" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_containers" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_containers" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# 
filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -# [[timeseries]] -# dataset="epel" -# dataseries="release" -# views=['line','stacked','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="epel" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="epel" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="epel" -# dataseries="age" -# views=['share','stacked'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# dataset="epel" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="epel" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="epel" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="epel" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="epel" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="epel" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" 
+dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND 
age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" 
+filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="age" +views=['share','stacked'] + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_containers" 
+dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="epel" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="epel" 
+dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="epel" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" [byrelease_defaults] title="$dataset_label: $dataseries_label by release" From 16b06b5efd7e815003ef92025b0318789a525f3b Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Feb 04 2022 16:18:05 +0000 Subject: [PATCH 5/49] "actually" --- diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index 5248b6c..060a83c 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -257,7 +257,6 @@ def main(): database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - ''' for timeseries in config['timeseries']: params = config['timeseries_defaults'].copy() params.update(timeseries) @@ -278,7 +277,6 @@ def main(): dataframe=df.pivot(index='week', columns=params['dataseries'], values='hits').astype("Int64"), ) - ''' for byrelease in config['byrelease']: params = config['byrelease_defaults'].copy() From c6874ad1d155c92e9c7ba3c987e3c83a927e0671 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Feb 04 2022 17:19:52 +0000 Subject: [PATCH 6/49] subtitle 
note --- diff --git a/config.toml b/config.toml index 5bdcb80..cf42e07 100644 --- a/config.toml +++ b/config.toml @@ -437,6 +437,7 @@ filebase="$dataset-timeseries-$dataseries-$view-persistent" [byrelease_defaults] title="$dataset_label: $dataseries_label by release" +subtitle="data for each release taken from the week of that release's peak" filebase="$dataset-byrelease-$dataseries-$view" extraselect="" views=['stacked','share'] From b6e465eb034dd71a30d19dd8a024d11b009bda17 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Mar 03 2022 14:58:25 +0000 Subject: [PATCH 7/49] note on epel 8 graphs --- diff --git a/TODO.md b/TODO.md index 018c7f8..4135926 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1,6 @@ +* epel -- need to special-case EL 8 by-release graphs to add peak _after_ + CentOS Linux 8 EOL + * text reports!!! * this week / last week / year-over-year * total systems / total persistent / total ephemeral (+%) From 8f5e0dcbacade800b6fc249212d732adcd76ea04 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: May 26 2022 14:26:46 +0000 Subject: [PATCH 8/49] change start date for Fedora stats to January 2021 because apparent "increase" before that is just DNF Countme enablement ramp up --- diff --git a/TODO.md b/TODO.md index 4135926..48e8f16 100644 --- a/TODO.md +++ b/TODO.md @@ -87,12 +87,6 @@ * skip waffle charts that will never be interesting -* once we have more than a year of data, start Fedora chart at 2021-01-01, - same as epel, because that initial growth curve is not really - representative of anything but upgrades and all the initial data - therefore skewed - - * Change ./run.sh into a makefile, because old-school. * Can we get anything interesting for "which variants tend to persist after EOL"? 
diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index 060a83c..f569540 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -177,6 +177,7 @@ def graph_releasebars(config, colormappings, params, dataframe): # FIXME: this is ugly startdate = config['startdate'][dataset.split('_', 1)[0]] + startrelease = config['startrelease'][dataset.split('_', 1)[0]] for view in params['views']: diff --git a/brontosaurus-washer.sh b/brontosaurus-washer.sh index 8dad9ef..b655cb7 100755 --- a/brontosaurus-washer.sh +++ b/brontosaurus-washer.sh @@ -31,11 +31,9 @@ EOF # While some test systems ran Fedora Linux 31, the feature landed -# in 32 (released 2020-04-27, so drop all the old stuff. Note -# that the Plan is to change this to the same as STARTDAY, and -# actually backfill even for F32 with velociraptorizer data. +# in 32 (released 2020-04-27, so drop all the old stuff. FEDORA_STARTVER=32 -FEDORA_STARTDAY='2020-04-27' +FEDORA_STARTDAY='2021-01-01' # And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) EPEL_STARTVER=8 EPEL_STARTDAY='2021-01-01' diff --git a/config.toml b/config.toml index cf42e07..cf4308a 100644 --- a/config.toml +++ b/config.toml @@ -30,9 +30,8 @@ colors = [ # TODO: not yet implemented image_types = ["png"] -# todo: change the fedora start date to also 2021-01-01, after DevConf.cz 2022. [startdate] -fedora = '2020-04-27' # week of Fedora 32 release +fedora = '2021-01-01' # F32 release not fully captured, so start here. epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 [dataset_labels] From daa17bcd3f6d8096e6eb7566450795b227f6660b Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: May 31 2022 15:10:31 +0000 Subject: [PATCH 9/49] todo note :) --- diff --git a/TODO.md b/TODO.md index 48e8f16..24222e4 100644 --- a/TODO.md +++ b/TODO.md @@ -34,6 +34,8 @@ * arch variants: * desktop,server +* Report estimating new installs vs upgrades (number of systems older than + the release itself ... 
need to factor in beta releaes date, etc....) * I guess we should make it so the timeseries definitions can loop over multiple datasets to avoid a lot of redundancy. Or at least, to apply to all Fedora datasets? (Yes, that: introduce a "distro" grouping.) From bfff4fe72d066be191b2365dd2c4d286e3b7e57a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 19 2022 16:11:32 +0000 Subject: [PATCH 10/49] stacked bar graphs work now --- diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index f569540..e048f62 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -148,7 +148,7 @@ def graph_timeseries(config, colormappings, params, dataframe): def graph_releasebars(config, colormappings, params, dataframe): - """Draws earch release in the set as a bar chart""" + """Draws each release in the set as a bar chart""" # If we find we have missing data, in the future: # dataframe.resample('W-MON') @@ -177,7 +177,6 @@ def graph_releasebars(config, colormappings, params, dataframe): # FIXME: this is ugly startdate = config['startdate'][dataset.split('_', 1)[0]] - startrelease = config['startrelease'][dataset.split('_', 1)[0]] for view in params['views']: @@ -192,9 +191,9 @@ def graph_releasebars(config, colormappings, params, dataframe): kind = 'bar' colormap = cmap - # Start the actual graph + # Start the actual graph graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind) + colormap=colormap, kind=kind, stacked=True) # Labels and titles and stuff. 
ax = plt.gca() @@ -258,47 +257,49 @@ def main(): database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - for timeseries in config['timeseries']: - params = config['timeseries_defaults'].copy() - params.update(timeseries) - - query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits - FROM checkins - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY week, {params['dataseries']} - ORDER BY week - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - graph_timeseries( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - for byrelease in config['byrelease']: - params = config['byrelease_defaults'].copy() - params.update(byrelease) - - query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits - FROM peak - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY release, {params['dataseries']} - ORDER BY release - """ - df = pd.read_sql_query(query, con=database) - - graph_releasebars( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='release', columns=params['dataseries'], - values='hits').astype("Int64"), - ) + if 'timeseries' in config: + for timeseries in config['timeseries']: + params = config['timeseries_defaults'].copy() + params.update(timeseries) + + query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits + FROM checkins + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY week, {params['dataseries']} + ORDER BY week + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + graph_timeseries( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + + if 'byrelease' in config: + for byrelease in config['byrelease']: + params = 
config['byrelease_defaults'].copy() + params.update(byrelease) + + query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits + FROM peak + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY release, {params['dataseries']} + ORDER BY release + """ + df = pd.read_sql_query(query, con=database) + + graph_releasebars( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='release', columns=params['dataseries'], + values='hits').astype("Int64"), + ) ''' diff --git a/config.toml b/config.toml index cf4308a..1da7e54 100644 --- a/config.toml +++ b/config.toml @@ -436,7 +436,7 @@ filebase="$dataset-timeseries-$dataseries-$view-persistent" [byrelease_defaults] title="$dataset_label: $dataseries_label by release" -subtitle="data for each release taken from the week of that release's peak" +subtitle="data for each release taken from the week of that release's (current) peak" filebase="$dataset-byrelease-$dataseries-$view" extraselect="" views=['stacked','share'] @@ -448,11 +448,35 @@ dataseries="age" [[byrelease]] dataset="fedora_updates_systems" dataseries="arch" +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[byrelease]] +dataset="fedora_updates_systems" +dataseries="arch" +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[byrelease]] +dataset="fedora_updates_systems" +dataseries="arch" +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" [[byrelease]] dataset="fedora_updates_systems" dataseries="variant" +[[byrelease]] +dataset="fedora_updates_systems" +dataseries="variant" +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[byrelease]] +dataset="fedora_updates_systems" +dataseries="variant" +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" [[byrelease]] dataset="fedora_updates_containers" @@ -501,6 +525,7 @@ dataseries="age" dataset="epel" 
dataseries="arch" + [[byrelease]] dataset="epel" dataseries="variant" From bb3331f97903bd226f6e068e68674f2d93cce322 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 19 2022 16:21:30 +0000 Subject: [PATCH 11/49] generic --- diff --git a/TODO.md b/TODO.md index 24222e4..72eed3a 100644 --- a/TODO.md +++ b/TODO.md @@ -111,7 +111,7 @@ * fix the code in brotosaurus washer to merge '' to 'none' rather than just renaming (works now because there are no natural 'none' entries). -* map "unknown" to "generic" +* map "generic" and "unknown" and "none" to "unspecified" * instead of throwing away entries in the washing phase (especially those below thresholds), write them to a special db for "fun" analysis From 406764a117e1b37ffb5da030756f7c90e0eea658 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 19 2022 20:47:04 +0000 Subject: [PATCH 12/49] NEW PLAN --- diff --git a/TODO.md b/TODO.md index 72eed3a..6a891c4 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1,16 @@ +* new plan! + + 1. Save the color mappings to a file as a separate step + * using defaults from config + 2. change brontosaurus-plotter to render _one_ image per call + * with a syntax for what to include or exclude by name + * and possibly with some number options? + 3. make timeline, releasebar, and waffle be separate commands + 4. have some script that pre-renders some defaults + 5. and a simple front-end for exploring the rest + +* put the dataset date in the filename! 
+ * epel -- need to special-case EL 8 by-release graphs to add peak _after_ CentOS Linux 8 EOL From a76eacd736b445f5af9492df98e7fefd4288d405 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 22 2022 15:23:16 +0000 Subject: [PATCH 13/49] oops filenames --- diff --git a/config.toml b/config.toml index 1da7e54..fb53367 100644 --- a/config.toml +++ b/config.toml @@ -448,19 +448,19 @@ dataseries="age" [[byrelease]] dataset="fedora_updates_systems" dataseries="arch" -filebase="$dataset-timeseries-$dataseries-$view-all" +filebase="$dataset-byrelease-$dataseries-$view-all" [[byrelease]] dataset="fedora_updates_systems" dataseries="arch" extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" +filebase="$dataset-byrelease-$dataseries-$view-ephemeral" [[byrelease]] dataset="fedora_updates_systems" dataseries="arch" extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" +filebase="$dataset-byrelease-$dataseries-$view-persistent" [[byrelease]] dataset="fedora_updates_systems" @@ -470,13 +470,13 @@ dataseries="variant" dataset="fedora_updates_systems" dataseries="variant" extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" +filebase="$dataset-byrelease-$dataseries-$view-ephemeral" [[byrelease]] dataset="fedora_updates_systems" dataseries="variant" extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" +filebase="$dataset-byrelease-$dataseries-$view-persistent" [[byrelease]] dataset="fedora_updates_containers" From 3bbff3d4ab55faf74e6bdc0f9fbf2a96f13b152a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 21:53:20 +0000 Subject: [PATCH 14/49] separate out color caching --- diff --git a/.gitignore b/.gitignore index 50d0c76..c2165cb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ **/.~lock*# db/* images/* -.ipynb_checkpoints/* \ No newline at end of file +.ipynb_checkpoints/* +__pycache__ \ No newline at end of file diff --git 
a/TODO.md b/TODO.md index 6a891c4..533cc45 100644 --- a/TODO.md +++ b/TODO.md @@ -1,7 +1,7 @@ * new plan! 1. Save the color mappings to a file as a separate step - * using defaults from config + * using defaults from config (DONE) 2. change brontosaurus-plotter to render _one_ image per call * with a syntax for what to include or exclude by name * and possibly with some number options? diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index e048f62..29ca465 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -6,9 +6,6 @@ import matplotlib.pyplot as plt import sqlite3 from string import Template -from collections import defaultdict -from collections import OrderedDict - from pprint import pprint import pandas as pd @@ -16,6 +13,8 @@ import toml import matplotlib as m +from brontosaurus_colorizer import load_color_cache, get_colors + DATAFILE = 'db/bronto.db' m.use("Agg") @@ -26,21 +25,6 @@ m.rcParams['font.family'] = 'Montserrat' m.rcParams['legend.frameon'] = False -def get_colors(colormappings, colorlist, dataset, dataseries, items): - """This makes colors 'sticky' for the whole run.""" - - key = dataset + '_' + dataseries - - # for each label item, assign the next color in the colorlist - # and save that for later. 
- outcolors = [] - for item in items: - if item not in colormappings[key]: - colormappings[key][item] = colorlist[len( - colormappings[key]) % len(colorlist)] - outcolors.append(colormappings[key][item]) - - return outcolors def graph_timeseries(config, colormappings, params, dataframe): @@ -253,7 +237,9 @@ def graph_releasebars(config, colormappings, params, dataframe): def main(): config = toml.load("config.toml") - colormappings = defaultdict(OrderedDict) + + + colormappings = load_color_cache(config['color_cache'],config['color_presets']) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) diff --git a/brontosaurus_colorizer.py b/brontosaurus_colorizer.py new file mode 100755 index 0000000..878f4dc --- /dev/null +++ b/brontosaurus_colorizer.py @@ -0,0 +1,114 @@ +#!/usr/bin/python3 +""" +This script goes through the database and pre-sets colors for each combination. + +This is saved to +""" + +from pprint import pprint +import sqlite3 + +from collections import defaultdict +from collections import OrderedDict + + +import toml +import re + + + + +def get_colors(colormappings, colorlist, dataset, dataseries, items): + """This makes colors 'sticky' for the whole run.""" + + key = dataset + '.' + dataseries + + # for each label item, assign the next color in the colorlist + # and save that for later. 
+ outcolors = [] + for item in items: + if item not in colormappings[key]: + colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] + outcolors.append(colormappings[key][str(item)]) + + return outcolors + + +def load_color_cache(cachefile,presetfile): + + colormappings = defaultdict(OrderedDict) + try: + cached = toml.load(cachefile) + except FileNotFoundError: + print(f"Can't open color cache {cachefile}, so starting fresh.") + cached = {} + + try: + presets = toml.load(presetfile) + cached.update(presets) + except FileNotFoundError: + print(f"No color preset file {presetfile} found.") + + + # gotta do this because we want a defaultdict but + # toml load just gives us a regular dict. + for key in cached.keys(): + colormappings[key] = cached[key].copy() + + return colormappings + + +def main(): + + config = toml.load("config.toml") + + colormappings = load_color_cache(config['color_cache'],config['color_presets']) + + database = sqlite3.connect(config['datafile'], detect_types=sqlite3.PARSE_DECLTYPES) + cur = database.cursor() + + + cur.execute("SELECT dataset FROM checkins GROUP BY dataset") + datasets = [t[0] for t in cur.fetchall()] + cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() + dataserieses = [t[0] for t in cur.description] + dataserieses.remove('week') + dataserieses.remove('dataset') + dataserieses.remove('hits') + + + + for dataset in datasets: + if not re.match('^[0-9a-z_]*$', dataset): + print(f"Bad dataset name! '%{dataset}") + exit(1) + for dataseries in dataserieses: + if not re.match('^[0-9a-z_]*$', dataseries): + print(f"Bad dataseries name! 
'%{dataseries}") + exit(1) + + if dataseries == 'age': + order="" + elif dataseries == 'release': + order="ORDER BY release DESC" + else: + order='ORDER BY total DESC' + query = f"""SELECT {dataseries},sum(hits) AS TOTAL + FROM checkins + WHERE dataset = '{dataset}' + GROUP BY {dataseries} + {order} + """ + cur.execute(query) + items = [t[0] for t in cur.fetchall()] + #print(dataset,dataseries,items) + + get_colors(colormappings, config['colors'], dataset, dataseries, items) + + + with open(config['color_cache'], "w") as toml_file: + toml.dump(colormappings, toml_file) + + +if __name__ == "__main__": + main() diff --git a/color-presets.toml b/color-presets.toml new file mode 100644 index 0000000..dd54edb --- /dev/null +++ b/color-presets.toml @@ -0,0 +1,8 @@ +["epel.variant"] +"CentOS Linux" = "#101010" +"Red Hat Enterprise Linux" = "#ee0000" +"CentOS Stream" = "#a14a8c" +"Rocky Linux" = "#10b981" +"AlmaLinux" = "#ffcc0a" +"Oracle Linux Server" = "#aaaaaa" +"CloudLinux" = "#0097f3" diff --git a/config.toml b/config.toml index fb53367..01163b4 100644 --- a/config.toml +++ b/config.toml @@ -1,3 +1,8 @@ +datafile = "db/bronto.db" + +color_presets = "color-presets.toml" +color_cache = "db/color-cache.toml" + ephemeral = "all" figsize = [16, 9] @@ -24,6 +29,7 @@ colors = [ '#aad0ee', '#101010', '#535961', + '#808080', ] # could be png, pdf, svg diff --git a/run.sh b/run.sh index 41cfa7d..09b84b5 100755 --- a/run.sh +++ b/run.sh @@ -84,6 +84,11 @@ echo "* Creating cages for different exhibits..." done echo " Built!" +echo "* Painting the feathers..." + rm db/color-cache.toml 2> /dev/null + ./brontosaurus-colorizer.py +echo " Vibrant!" + echo "* Drawing portraits from the fossilized remains... 
" LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) ./brontosaurus-plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null From d1c81e9c8ae9178b789224edefd0d082a314ecbd Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 21:55:01 +0000 Subject: [PATCH 15/49] underscore instead of hyphen --- diff --git a/brontosaurus-egg-sorter.py b/brontosaurus-egg-sorter.py deleted file mode 100755 index e8dff2e..0000000 --- a/brontosaurus-egg-sorter.py +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/bin/python3 - -# Written by Matthew Miller - -# This estimates the number of systems of age 1 which turn into long-running -# systems and which are just ephemeral test installs, ci system instances, -# builders, etc., and relabels the latter as "age=0". -# -# The basic concept is: a system reporting as "one" could either be -# persistent (that is, goes into "two" next week) or ephemeral (doesn't). -# Any increase in group two [across versions] _must_ be from systems coming -# from group one. Therefore, at least that number of systems from "one" must -# be "persistent". However, that can undercount if systems from group two go -# offline — it may be that there are more persistent systems than we -# thought. -# -# We can mitigate this a little bit by also considering the flow to groups -# three and four, working backwards, like this: -# -# one,two,three,four = this weeks age values -# one1,two1,three1,four1 = next week's age values -# -# # If four1-four is negative, older more systems are going offline then -# # aging into it. Increases must come from group three, or ghosts -# # (ignored; see below). There may be _more_ from three to four, propping -# # up the value as other systems go offline, but it can't be more than -# # the total increase -# min_flow_to_four = max(four1-four,0) # time only goes forward -# max_flow_to_four = min(three,four1) # could be _total turnover_ -# -# # Without any flow from two, three would go down by the range above. 
-# # How much _did_ we go down by (if any?) Anything above that must -# # be flow from two (or ghosts!) -# min_flow_to_three = max(three1-three,0) + min_flow_to_four -# # max is what's there in the next week _plus_ what could have -# # moved on. -# max_flow_to_three = min(two,three1 + max_flow_to_four) -# -# # Same deal, but one cohort over... -# min_flow_to_two = max(two1-two,0) + min_flow_to_three -# max_flow_to_two = min(one,two1 + max_flow_to_three) -# -# # Leaving us with ... -# -# min_ephemeral (zero) = one - max_flow_to_two -# min_persistent (one) = min(one,min_flow_to_two) -# -# max_ephemeral (zero) = one - min(one,min_flow_to_two) -# max_persistent (one) = max_flow_to_two -# -# # split? -# -# or -# -# moved_up=min(one,max(two1-two,0) + max(three1-three,0) + max(four1-four,0)) -# zero=one-moved_up -# one=moved_up -# -# *or* -# -# moved_up=min(one,two1 + min(two,three1 + min(three,four1))) -# zero=one-moved_up -# one=moved_up -# -# Of course, this assumes that random new systems won't show up in -# later groups — ghosts! Right now, I'm assuming they're rare enough -# to ignore for the purpose of this estimation. Theoretically, ghost -# systems mean that the minimum estimate is actually too high. -# -# Also: since age 1 is 1 week, all systems must move up (or vanish). -# Age 2 is 3 weeks (weeks 2,3,4), so assuming that the average number -# of new permanant installations is roughly smooth week to week (no -# huge jumps) on some days, that means we can assume turnover of 1/3 for -# that group. Age 3 is 20 weeks (weeks 5-24, inclusive), so turnover should -# be much smaller — more like 1/20! -# -# Also. this it keeps track of the percentage of a given system type that is -# ephemeral, and uses that to guess for missing values (like, the last week -# in the dataset, where there is no "next week" yet.) -# -# To consider: bias towards upgrades when a new release is just out? or does -# that get us too much seeing what we want to see? 
We could deterimine this -# by the first week systems seen breaks some threshold, or more cleverly -# by noticing when the curve jumps. -# - -import os -import sys -import string -import sqlite3 -import datetime -from collections import Counter -from tokenize import group - -DATAFILE = 'db/bronto.db' - -DATABASE = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - -loopcursor = DATABASE.cursor() -nextcursor = DATABASE.cursor() - -zerocounter = Counter() -onecounter = Counter() - - -loopcursor.execute( - "SELECT DISTINCT(dataset) FROM checkins ORDER BY dataset DESC") -groups = [item for sublist in loopcursor.fetchall() for item in sublist] - - -for group in groups: - zerocounter.clear() - onecounter.clear() - - loopcursor.execute( - 'SELECT * FROM checkins WHERE dataset = :dataset AND age = 1 ORDER BY week', {'dataset': group}) - for row in loopcursor: - (week, group, release, variant, arch, age, hits) = row - - thisone = hits - - # get the other age groups for this type of system, if any - query = """SELECT age,hits FROM checkins WHERE - week = :week AND - dataset = :dataset AND - release = :release AND - variant = :variant AND - arch = :arch AND - age > 1 - ORDER BY age - """ - nextcursor.execute(query, - {"week": week, - "dataset": group, - "release": release, - "variant": variant, - "arch": arch - }) - thisweek = nextcursor.fetchall() - twothis = 0 - threethis = 0 - fourthis = 0 - if thisweek: - for agegroup in thisweek: - (age, hits) = agegroup - if age == 2: - twothis = hits - elif age == 3: - threethis = hits - elif age == 4: - fourthis = hits - else: - raise ValueError() - - # Get the age groups for next week (if any) for this type of system - # For Fedora Linux, we're also including higher release numbers -- - # systems could be upgraded! However, that's unlikely for epel. So.. 
- if group.split('_', 1)[0] == 'fedora': - query = """SELECT age,sum(hits) FROM checkins WHERE - week = :nextweek AND - dataset = :dataset AND - release >= :release AND - variant = :variant AND - arch = :arch AND - age > 1 - GROUP BY age - ORDER BY age""" - else: - query = """SELECT age,sum(hits) FROM checkins WHERE - week = :nextweek AND - dataset = :dataset AND - release = :release AND - variant = :variant AND - arch = :arch AND - age > 1 - GROUP BY age - ORDER BY age""" - nextcursor.execute(query, - {"nextweek": (datetime.datetime.fromisoformat(week) + datetime.timedelta(weeks=1)).strftime("%Y-%m-%d"), # this was easier when it was weeknums! - "dataset": group, - "release": release, - "variant": variant, - "arch": arch - }) - nextweek = nextcursor.fetchall() - - if nextweek: - - twonext = 0 - threenext = 0 - fournext = 0 - - for agegroup in nextweek: - (age, hits) = agegroup - if age == 2: - twonext = hits - elif age == 3: - threenext = hits - elif age == 4: - fournext = hits - else: - raise ValueError(f"age is {age}") - - # okay, whew. see long comment at top of file for explanation of the theory. - # in practice, there are these three possibilities: - # moved_up_min = max(twonext-twothis, 0) + - # max(threenext - threethis, 0) + - # max(fournext-fourthis, 0) - # moved_up_max = twonext + min(twothis, threenext + min(threethis, fournext)) - # - # moved_up_timebased = int(twothis/3 + threethis/20 + [some estimate of dropout rate of fourthis]) - # - # So what I'm doing here is for each period, going with time-based capped by - # the min and max flow for that age group. And then for age 4, arbitrarily picking - # a ratio. Note that min and timebased separate easily, but the max is difficult - # - # For now, we're assuming _minimum_ flow from age 3 into age 4. - # Why? This makes the bands for age 1 and age 2 look most reasonable. 
- moved_to_four = max(fournext-fourthis, 0) - moved_to_three = min(max(threethis/20, max(threenext-threethis, 0)+moved_to_four), - min(twothis, threenext + moved_to_four)) - moved_to_two = min(max( - twothis/3, max(twonext-twothis, 0) + moved_to_three), min(thisone, twonext + moved_to_three)) - moved_up = min(thisone, int(moved_to_two)) - new_zero = thisone - moved_up - assert(new_zero >= 0) - new_one = moved_up - assert (new_one == thisone-new_zero) - - # keep a running total for the estimate - zerocounter[(release, variant, arch)] += new_zero - onecounter[(release, variant, arch)] += new_one - - else: # no values for next week, so... estimate! - totalprevious = zerocounter[( - release, variant, arch)] + onecounter[(release, variant, arch)] - if totalprevious: - new_zero = round( - thisone*zerocounter[(release, variant, arch)]/totalprevious) - new_one = thisone-new_zero - else: - # no estimate for this row, so assume all ephemeral - new_zero = thisone - new_one = 0 - - assert new_zero + \ - new_one == thisone, "{} + {} = {}".format( - new_zero, new_one, thisone) - - nextcursor.execute("""INSERT INTO checkins - (week, dataset, release, variant, arch, age, hits) - VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", - {"week": week, - "dataset": group, - "release": release, - "variant": variant, - "arch": arch, - "age": 0, - "hits": new_zero - }) - nextcursor.execute("""REPLACE INTO checkins - (week, dataset, release, variant, arch, age, hits) - VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", - {"week": week, - "dataset": group, - "release": release, - "variant": variant, - "arch": arch, - "age": 1, - "hits": new_one - }) - - # these are just clutter, and it's easier to zap them at the end - # than to avoid making them in the loop. 
- loopcursor.execute(f"""DELETE from checkins WHERE hits=0""") - - DATABASE.commit() diff --git a/brontosaurus-fight.sh b/brontosaurus-fight.sh deleted file mode 100755 index 5726335..0000000 --- a/brontosaurus-fight.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# -# Create a view which only shows the weeks where each release -# is at its peak. If someone actually is Good At SQL, I would -# not mind help making this more clear. - -sqlite3 db/bronto.db << EOF - DROP VIEW IF EXISTS peak; - CREATE VIEW peak AS - SELECT checkins.week, - checkins.dataset, - checkins.release, - checkins.variant, - checkins.arch, - checkins.age, - checkins.hits - FROM checkins - INNER JOIN - (SELECT week,dataset,release,max(hits) - FROM (SELECT week,dataset,release,sum(hits) AS hits - FROM checkins - GROUP BY week,dataset,release - ORDER BY week) - GROUP BY dataset,release) AS peaks - ON peaks.week = checkins.week - AND peaks.dataset = checkins.dataset - AND peaks.release = checkins.release; -EOF diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py deleted file mode 100755 index 29ca465..0000000 --- a/brontosaurus-plotter.py +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/python3 - -import matplotlib.dates as dates -import matplotlib.pyplot as plt - -import sqlite3 -from string import Template - -from pprint import pprint - -import pandas as pd -import toml - -import matplotlib as m - -from brontosaurus_colorizer import load_color_cache, get_colors - -DATAFILE = 'db/bronto.db' - -m.use("Agg") - -m.style.use('seaborn-colorblind') -m.rcParams['font.size'] = 12 -m.rcParams['font.family'] = 'Montserrat' -m.rcParams['legend.frameon'] = False - - - - -def graph_timeseries(config, colormappings, params, dataframe): - """Draws line or area chart for a dataseries over time.""" - - # If we find we have missing data, in the future: - # dataframe.resample('W-MON') - - dataset = params['dataset'] - dataseries = params['dataseries'] - - ################# - # Instead of this, accumulate anything 
more than 10 into "other" - # ... and do it _elsewhere_ (easier to do before pivot anyway!) - # + limit number of columns to 10 + other - - hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < 0.2/100 - dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) - - ################## - # our colors. - # the complication here is keeping the same color for the same label - # across multiple graphs! - cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], - dataset, dataseries, list(dataframe.columns))) - - ################## - # and now.... graph it! - - # FIXME: this is ugly - startdate = config['startdate'][dataset.split('_', 1)[0]] - - for view in params['views']: - - match view: - case 'line': - df = dataframe[startdate:] - kind = 'line' - colormap = cmap - case 'stacked': - df = dataframe[startdate:][dataframe.columns[::-1]] - kind = 'area' - colormap = m.colors.ListedColormap(cmap.colors[::-1]) - case 'share': - df = dataframe[startdate:].div( - dataframe.sum(axis=1), axis=0)*100 - kind = 'area' - colormap = cmap - - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind) - - # Labels and titles and stuff. 
- ax = plt.gca() - - handles, labels = ax.get_legend_handles_labels() - - # TODO: generalize this - if dataseries == 'age': - labels = list(map(config['age_labels'].get, labels)) - - if view == 'stacked': - handles[:] = handles[::-1] - labels[:] = labels[::-1] - - plt.legend(handles, labels, loc='center left', - bbox_to_anchor=(1.0, 0.5)) - - madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'view': view, - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][view]} - - if 'title' in params: - plt.suptitle(Template(params['title']).safe_substitute(madlibs), - fontsize=24) - - # FIX: make work - if 'subtitle' in params: - graph.set_title( - Template(params['subtitle']).safe_substitute(madlibs), - fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", - dpi=config['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") - - -def graph_releasebars(config, colormappings, params, dataframe): - """Draws each release in the set as a bar chart""" - - # If we find we have missing data, in the future: - # dataframe.resample('W-MON') - - dataset = params['dataset'] - dataseries = params['dataseries'] - - ################# - # Instead of this, accumulate anything more than 10 into "other" - # ... and do it _elsewhere_ (easier to do before pivot anyway!) 
- # + limit number of columns to 10 + other - - hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < 0.2/100 - dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) - - ################## - # our colors. - # the complication here is keeping the same color for the same label - # across multiple graphs! - cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], - dataset, dataseries, list(dataframe.columns))) - - ################## - # and now.... graph it! - - # FIXME: this is ugly - startdate = config['startdate'][dataset.split('_', 1)[0]] - - for view in params['views']: - - match view: - case 'stacked': - df = dataframe[startdate:] - kind = 'bar' - colormap = cmap - case 'share': - df = dataframe[startdate:].div( - dataframe.sum(axis=1), axis=0)*100 - kind = 'bar' - colormap = cmap - - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind, stacked=True) - - # Labels and titles and stuff. 
- ax = plt.gca() - - handles, labels = ax.get_legend_handles_labels() - - # TODO: generalize this - if dataseries == 'age': - labels = list(map(config['age_labels'].get, labels)) - - if view == 'stacked': - handles[:] = handles[::-1] - labels[:] = labels[::-1] - - plt.legend(handles, labels, loc='center left', - bbox_to_anchor=(1.0, 0.5)) - - madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'view': view, - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][view]} - - if 'title' in params: - plt.suptitle(Template(params['title']).safe_substitute(madlibs), - fontsize=24) - - # FIX: make work - if 'subtitle' in params: - graph.set_title( - Template(params['subtitle']).safe_substitute(madlibs), - fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", - dpi=config['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") - - -########################################### - - -def main(): - - config = toml.load("config.toml") - - - colormappings = load_color_cache(config['color_cache'],config['color_presets']) - - database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - - if 'timeseries' in config: - for timeseries in config['timeseries']: - params = config['timeseries_defaults'].copy() - params.update(timeseries) - - query = f"""SELECT 
week, {params['dataseries']}, SUM(hits) as hits - FROM checkins - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY week, {params['dataseries']} - ORDER BY week - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - graph_timeseries( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - if 'byrelease' in config: - for byrelease in config['byrelease']: - params = config['byrelease_defaults'].copy() - params.update(byrelease) - - query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits - FROM peak - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY release, {params['dataseries']} - ORDER BY release - """ - df = pd.read_sql_query(query, con=database) - - graph_releasebars( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='release', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - -''' -### getting ahead of myself: this is for the waffle charts -query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" -df = pd.read_sql_query(query, parse_dates='week', con=database) -df -''' - -if __name__ == "__main__": - main() diff --git a/brontosaurus-slicer.sh b/brontosaurus-slicer.sh deleted file mode 100755 index 8235990..0000000 --- a/brontosaurus-slicer.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash - -# This script implements the filter rules in NOTES.md, and splits -# the totals.db into a new cleaned-up table in a "bronto.db" file. -# -# It splits the records into major groups: EPEL, and then also -# "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. -# Because "group" is a reserved word in sql, we use "dataset". -# -# It removes the os_ prefix, because without repo_ columns there -# is no ambiguity to resolve. 
-# -# And, it converts weeknums (which start on January 5th, 1970, -# the first Monday of the epoch) to dates. -# -# We're dropping the os_version field from EL and instead just -# using the first digit (e.g. "8" or "9") as "release". We are -# also using _name_ as variant. We could keep these separate, -# but this way we have the same fields for both types. - - -sqlite3 db/totals.db << EOF - -ATTACH DATABASE 'db/bronto.db' AS bronto; - -DROP TABLE IF EXISTS bronto.checkins; - -CREATE TABLE bronto.checkins( - week INT, - dataset TEXT, - release TEXT, - variant TEXT, - arch TEXT, - age INT CHECK(age<5), - hits INT, - UNIQUE (week,dataset,release,variant,arch,age) -); - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_systems" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag LIKE 'updates-released-f__' - AND os_version = substr(repo_tag,-2,2) - AND os_arch = repo_arch - AND os_variant != 'container' - AND os_variant != 'toolbx' - AND os_variant != 'snappy' - GROUP BY week,release,variant,arch,age; - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_containers" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag LIKE 'updates-released-f__' - AND os_version = substr(repo_tag,-2,2) - AND os_arch = repo_arch - AND (os_variant = 'container' OR - os_variant = 'toolbx' OR - os_variant = 'snappy') - GROUP BY week,release,variant,arch,age; - - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_systems" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - 
SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag = 'fedora-rawhide' OR repo_tag = 'rawhide' - AND os_arch = repo_arch - AND os_variant != 'container' - AND os_variant != 'toolbx' - AND os_variant != 'snappy' - GROUP BY week,release,variant,arch,age; - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_containers" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag = 'rawhide' - AND os_arch = repo_arch - AND (os_variant = 'container' OR - os_variant = 'toolbx' OR - os_variant = 'snappy') - GROUP BY week,release,variant,arch,age; - - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "epel" AS dataset, - CASE instr(os_version,".") - WHEN 0 THEN os_version - ELSE substr(os_version,0,instr(os_version,".")) - END AS release, - os_name AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE repo_tag = 'epel-' || release - AND os_arch = repo_arch - GROUP BY week,release,variant,arch,age; - -DETACH bronto; - -EOF diff --git a/brontosaurus-washer.sh b/brontosaurus-washer.sh deleted file mode 100755 index b655cb7..0000000 --- a/brontosaurus-washer.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# -# For every table in bronto.db, delete "known bad" variants. -# - - -# Please document each new reason for cleaning something here, so we know -# why later. -# -# The variants 09975472-cc15-4020-9231-cc6743a15b0f and c1e9af0e-5816-4644-9c89-703aa1fdcf04 -# are strings I (mattdm) used for testing. 
-# -# The variant "CentOS Stream v21.*" is some sort of horrible scripted thing -# apparently, where it keeps getting longer and longer with additional tags -# -# Also, for each table, sets any variant that is '' to 'none', because -# '' is hard to work with. (I think this is when people have manually put -# "VARIANT_ID=", as opposed to not having one. I don't think that's useful -# to track separately from 'generic', really, so an alternative would be -# to merge them... but doing this for now.) -# FIXME: this needs to merge them in case someone starts actually sending -# "none" as the string — we'll get a uniqueness constraint violation. -# -# This is a regex, in case that's not clear. -GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" - -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; - UPDATE checkins SET variant='none' WHERE variant=''; -EOF - - -# While some test systems ran Fedora Linux 31, the feature landed -# in 32 (released 2020-04-27, so drop all the old stuff. -FEDORA_STARTVER=32 -FEDORA_STARTDAY='2021-01-01' -# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) -EPEL_STARTVER=8 -EPEL_STARTDAY='2021-01-01' -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; -EOF - -# Clean up entries for name, arch, or release that show up -# with less than some threshold in _total_ hits in the -# whole database, or where the weekly numbers never exceed some -# small value. This removes both small bursts of nonsense -# and also most long-lived singletons. We may want to revisit -# what these are set to when we have more data. 
-# -# Note that since we regenerate the whole db from totals.db -# each week, if something exceeds this threshold later, it will -# suddenly appear -THRESHOLD_TOTAL=100 -THRESHOLD_WEEKLY=3 - -for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do - sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); -EOF -done diff --git a/brontosaurus_egg-sorter.py b/brontosaurus_egg-sorter.py new file mode 100755 index 0000000..e8dff2e --- /dev/null +++ b/brontosaurus_egg-sorter.py @@ -0,0 +1,277 @@ +#!/usr/bin/python3 + +# Written by Matthew Miller + +# This estimates the number of systems of age 1 which turn into long-running +# systems and which are just ephemeral test installs, ci system instances, +# builders, etc., and relabels the latter as "age=0". +# +# The basic concept is: a system reporting as "one" could either be +# persistent (that is, goes into "two" next week) or ephemeral (doesn't). 
+# Any increase in group two [across versions] _must_ be from systems coming +# from group one. Therefore, at least that number of systems from "one" must +# be "persistent". However, that can undercount if systems from group two go +# offline — it may be that there are more persistent systems than we +# thought. +# +# We can mitigate this a little bit by also considering the flow to groups +# three and four, working backwards, like this: +# +# one,two,three,four = this weeks age values +# one1,two1,three1,four1 = next week's age values +# +# # If four1-four is negative, older more systems are going offline then +# # aging into it. Increases must come from group three, or ghosts +# # (ignored; see below). There may be _more_ from three to four, propping +# # up the value as other systems go offline, but it can't be more than +# # the total increase +# min_flow_to_four = max(four1-four,0) # time only goes forward +# max_flow_to_four = min(three,four1) # could be _total turnover_ +# +# # Without any flow from two, three would go down by the range above. +# # How much _did_ we go down by (if any?) Anything above that must +# # be flow from two (or ghosts!) +# min_flow_to_three = max(three1-three,0) + min_flow_to_four +# # max is what's there in the next week _plus_ what could have +# # moved on. +# max_flow_to_three = min(two,three1 + max_flow_to_four) +# +# # Same deal, but one cohort over... +# min_flow_to_two = max(two1-two,0) + min_flow_to_three +# max_flow_to_two = min(one,two1 + max_flow_to_three) +# +# # Leaving us with ... +# +# min_ephemeral (zero) = one - max_flow_to_two +# min_persistent (one) = min(one,min_flow_to_two) +# +# max_ephemeral (zero) = one - min(one,min_flow_to_two) +# max_persistent (one) = max_flow_to_two +# +# # split? 
+# +# or +# +# moved_up=min(one,max(two1-two,0) + max(three1-three,0) + max(four1-four,0)) +# zero=one-moved_up +# one=moved_up +# +# *or* +# +# moved_up=min(one,two1 + min(two,three1 + min(three,four1))) +# zero=one-moved_up +# one=moved_up +# +# Of course, this assumes that random new systems won't show up in +# later groups — ghosts! Right now, I'm assuming they're rare enough +# to ignore for the purpose of this estimation. Theoretically, ghost +# systems mean that the minimum estimate is actually too high. +# +# Also: since age 1 is 1 week, all systems must move up (or vanish). +# Age 2 is 3 weeks (weeks 2,3,4), so assuming that the average number +# of new permanant installations is roughly smooth week to week (no +# huge jumps) on some days, that means we can assume turnover of 1/3 for +# that group. Age 3 is 20 weeks (weeks 5-24, inclusive), so turnover should +# be much smaller — more like 1/20! +# +# Also. this it keeps track of the percentage of a given system type that is +# ephemeral, and uses that to guess for missing values (like, the last week +# in the dataset, where there is no "next week" yet.) +# +# To consider: bias towards upgrades when a new release is just out? or does +# that get us too much seeing what we want to see? We could deterimine this +# by the first week systems seen breaks some threshold, or more cleverly +# by noticing when the curve jumps. 
+# + +import os +import sys +import string +import sqlite3 +import datetime +from collections import Counter +from tokenize import group + +DATAFILE = 'db/bronto.db' + +DATABASE = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + +loopcursor = DATABASE.cursor() +nextcursor = DATABASE.cursor() + +zerocounter = Counter() +onecounter = Counter() + + +loopcursor.execute( + "SELECT DISTINCT(dataset) FROM checkins ORDER BY dataset DESC") +groups = [item for sublist in loopcursor.fetchall() for item in sublist] + + +for group in groups: + zerocounter.clear() + onecounter.clear() + + loopcursor.execute( + 'SELECT * FROM checkins WHERE dataset = :dataset AND age = 1 ORDER BY week', {'dataset': group}) + for row in loopcursor: + (week, group, release, variant, arch, age, hits) = row + + thisone = hits + + # get the other age groups for this type of system, if any + query = """SELECT age,hits FROM checkins WHERE + week = :week AND + dataset = :dataset AND + release = :release AND + variant = :variant AND + arch = :arch AND + age > 1 + ORDER BY age + """ + nextcursor.execute(query, + {"week": week, + "dataset": group, + "release": release, + "variant": variant, + "arch": arch + }) + thisweek = nextcursor.fetchall() + twothis = 0 + threethis = 0 + fourthis = 0 + if thisweek: + for agegroup in thisweek: + (age, hits) = agegroup + if age == 2: + twothis = hits + elif age == 3: + threethis = hits + elif age == 4: + fourthis = hits + else: + raise ValueError() + + # Get the age groups for next week (if any) for this type of system + # For Fedora Linux, we're also including higher release numbers -- + # systems could be upgraded! However, that's unlikely for epel. So.. 
+ if group.split('_', 1)[0] == 'fedora': + query = """SELECT age,sum(hits) FROM checkins WHERE + week = :nextweek AND + dataset = :dataset AND + release >= :release AND + variant = :variant AND + arch = :arch AND + age > 1 + GROUP BY age + ORDER BY age""" + else: + query = """SELECT age,sum(hits) FROM checkins WHERE + week = :nextweek AND + dataset = :dataset AND + release = :release AND + variant = :variant AND + arch = :arch AND + age > 1 + GROUP BY age + ORDER BY age""" + nextcursor.execute(query, + {"nextweek": (datetime.datetime.fromisoformat(week) + datetime.timedelta(weeks=1)).strftime("%Y-%m-%d"), # this was easier when it was weeknums! + "dataset": group, + "release": release, + "variant": variant, + "arch": arch + }) + nextweek = nextcursor.fetchall() + + if nextweek: + + twonext = 0 + threenext = 0 + fournext = 0 + + for agegroup in nextweek: + (age, hits) = agegroup + if age == 2: + twonext = hits + elif age == 3: + threenext = hits + elif age == 4: + fournext = hits + else: + raise ValueError(f"age is {age}") + + # okay, whew. see long comment at top of file for explanation of the theory. + # in practice, there are these three possibilities: + # moved_up_min = max(twonext-twothis, 0) + + # max(threenext - threethis, 0) + + # max(fournext-fourthis, 0) + # moved_up_max = twonext + min(twothis, threenext + min(threethis, fournext)) + # + # moved_up_timebased = int(twothis/3 + threethis/20 + [some estimate of dropout rate of fourthis]) + # + # So what I'm doing here is for each period, going with time-based capped by + # the min and max flow for that age group. And then for age 4, arbitrarily picking + # a ratio. Note that min and timebased separate easily, but the max is difficult + # + # For now, we're assuming _minimum_ flow from age 3 into age 4. + # Why? This makes the bands for age 1 and age 2 look most reasonable. 
+ moved_to_four = max(fournext-fourthis, 0) + moved_to_three = min(max(threethis/20, max(threenext-threethis, 0)+moved_to_four), + min(twothis, threenext + moved_to_four)) + moved_to_two = min(max( + twothis/3, max(twonext-twothis, 0) + moved_to_three), min(thisone, twonext + moved_to_three)) + moved_up = min(thisone, int(moved_to_two)) + new_zero = thisone - moved_up + assert(new_zero >= 0) + new_one = moved_up + assert (new_one == thisone-new_zero) + + # keep a running total for the estimate + zerocounter[(release, variant, arch)] += new_zero + onecounter[(release, variant, arch)] += new_one + + else: # no values for next week, so... estimate! + totalprevious = zerocounter[( + release, variant, arch)] + onecounter[(release, variant, arch)] + if totalprevious: + new_zero = round( + thisone*zerocounter[(release, variant, arch)]/totalprevious) + new_one = thisone-new_zero + else: + # no estimate for this row, so assume all ephemeral + new_zero = thisone + new_one = 0 + + assert new_zero + \ + new_one == thisone, "{} + {} = {}".format( + new_zero, new_one, thisone) + + nextcursor.execute("""INSERT INTO checkins + (week, dataset, release, variant, arch, age, hits) + VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", + {"week": week, + "dataset": group, + "release": release, + "variant": variant, + "arch": arch, + "age": 0, + "hits": new_zero + }) + nextcursor.execute("""REPLACE INTO checkins + (week, dataset, release, variant, arch, age, hits) + VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", + {"week": week, + "dataset": group, + "release": release, + "variant": variant, + "arch": arch, + "age": 1, + "hits": new_one + }) + + # these are just clutter, and it's easier to zap them at the end + # than to avoid making them in the loop. 
+ loopcursor.execute(f"""DELETE from checkins WHERE hits=0""") + + DATABASE.commit() diff --git a/brontosaurus_fight.sh b/brontosaurus_fight.sh new file mode 100755 index 0000000..5726335 --- /dev/null +++ b/brontosaurus_fight.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Create a view which only shows the weeks where each release +# is at its peak. If someone actually is Good At SQL, I would +# not mind help making this more clear. + +sqlite3 db/bronto.db << EOF + DROP VIEW IF EXISTS peak; + CREATE VIEW peak AS + SELECT checkins.week, + checkins.dataset, + checkins.release, + checkins.variant, + checkins.arch, + checkins.age, + checkins.hits + FROM checkins + INNER JOIN + (SELECT week,dataset,release,max(hits) + FROM (SELECT week,dataset,release,sum(hits) AS hits + FROM checkins + GROUP BY week,dataset,release + ORDER BY week) + GROUP BY dataset,release) AS peaks + ON peaks.week = checkins.week + AND peaks.dataset = checkins.dataset + AND peaks.release = checkins.release; +EOF diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py new file mode 100755 index 0000000..29ca465 --- /dev/null +++ b/brontosaurus_plotter.py @@ -0,0 +1,299 @@ +#!/usr/bin/python3 + +import matplotlib.dates as dates +import matplotlib.pyplot as plt + +import sqlite3 +from string import Template + +from pprint import pprint + +import pandas as pd +import toml + +import matplotlib as m + +from brontosaurus_colorizer import load_color_cache, get_colors + +DATAFILE = 'db/bronto.db' + +m.use("Agg") + +m.style.use('seaborn-colorblind') +m.rcParams['font.size'] = 12 +m.rcParams['font.family'] = 'Montserrat' +m.rcParams['legend.frameon'] = False + + + + +def graph_timeseries(config, colormappings, params, dataframe): + """Draws line or area chart for a dataseries over time.""" + + # If we find we have missing data, in the future: + # dataframe.resample('W-MON') + + dataset = params['dataset'] + dataseries = params['dataseries'] + + ################# + # Instead of this, accumulate anything more 
than 10 into "other" + # ... and do it _elsewhere_ (easier to do before pivot anyway!) + # + limit number of columns to 10 + other + + hidelist = dataframe.div(dataframe.sum( + axis=1), axis=0).max() < 0.2/100 + dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) + + ################## + # our colors. + # the complication here is keeping the same color for the same label + # across multiple graphs! + cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], + dataset, dataseries, list(dataframe.columns))) + + ################## + # and now.... graph it! + + # FIXME: this is ugly + startdate = config['startdate'][dataset.split('_', 1)[0]] + + for view in params['views']: + + match view: + case 'line': + df = dataframe[startdate:] + kind = 'line' + colormap = cmap + case 'stacked': + df = dataframe[startdate:][dataframe.columns[::-1]] + kind = 'area' + colormap = m.colors.ListedColormap(cmap.colors[::-1]) + case 'share': + df = dataframe[startdate:].div( + dataframe.sum(axis=1), axis=0)*100 + kind = 'area' + colormap = cmap + + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind) + + # Labels and titles and stuff. 
+ ax = plt.gca() + + handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) + + if view == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] + + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) + + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': view, + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][view]} + + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) + + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) + + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') + + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") + + plt.close(graph.figure) + print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") + + +def graph_releasebars(config, colormappings, params, dataframe): + """Draws each release in the set as a bar chart""" + + # If we find we have missing data, in the future: + # dataframe.resample('W-MON') + + dataset = params['dataset'] + dataseries = params['dataseries'] + + ################# + # Instead of this, accumulate anything more than 10 into "other" + # ... and do it _elsewhere_ (easier to do before pivot anyway!) 
+ # + limit number of columns to 10 + other + + hidelist = dataframe.div(dataframe.sum( + axis=1), axis=0).max() < 0.2/100 + dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) + + ################## + # our colors. + # the complication here is keeping the same color for the same label + # across multiple graphs! + cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], + dataset, dataseries, list(dataframe.columns))) + + ################## + # and now.... graph it! + + # FIXME: this is ugly + startdate = config['startdate'][dataset.split('_', 1)[0]] + + for view in params['views']: + + match view: + case 'stacked': + df = dataframe[startdate:] + kind = 'bar' + colormap = cmap + case 'share': + df = dataframe[startdate:].div( + dataframe.sum(axis=1), axis=0)*100 + kind = 'bar' + colormap = cmap + + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind, stacked=True) + + # Labels and titles and stuff. 
+ ax = plt.gca() + + handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) + + if view == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] + + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) + + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': view, + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][view]} + + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) + + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) + + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') + + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") + + plt.close(graph.figure) + print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") + + +########################################### + + +def main(): + + config = toml.load("config.toml") + + + colormappings = load_color_cache(config['color_cache'],config['color_presets']) + + database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + + if 'timeseries' in config: + for timeseries in config['timeseries']: + params = config['timeseries_defaults'].copy() + params.update(timeseries) + + query = f"""SELECT 
week, {params['dataseries']}, SUM(hits) as hits + FROM checkins + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY week, {params['dataseries']} + ORDER BY week + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + graph_timeseries( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + + if 'byrelease' in config: + for byrelease in config['byrelease']: + params = config['byrelease_defaults'].copy() + params.update(byrelease) + + query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits + FROM peak + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY release, {params['dataseries']} + ORDER BY release + """ + df = pd.read_sql_query(query, con=database) + + graph_releasebars( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='release', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + + +''' +### getting ahead of myself: this is for the waffle charts +query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" +df = pd.read_sql_query(query, parse_dates='week', con=database) +df +''' + +if __name__ == "__main__": + main() diff --git a/brontosaurus_slicer.py b/brontosaurus_slicer.py new file mode 100755 index 0000000..8235990 --- /dev/null +++ b/brontosaurus_slicer.py @@ -0,0 +1,129 @@ +#!/bin/bash + +# This script implements the filter rules in NOTES.md, and splits +# the totals.db into a new cleaned-up table in a "bronto.db" file. +# +# It splits the records into major groups: EPEL, and then also +# "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. +# Because "group" is a reserved word in sql, we use "dataset". +# +# It removes the os_ prefix, because without repo_ columns there +# is no ambiguity to resolve. 
+# +# And, it converts weeknums (which start on January 5th, 1970, +# the first Monday of the epoch) to dates. +# +# We're dropping the os_version field from EL and instead just +# using the first digit (e.g. "8" or "9") as "release". We are +# also using _name_ as variant. We could keep these separate, +# but this way we have the same fields for both types. + + +sqlite3 db/totals.db << EOF + +ATTACH DATABASE 'db/bronto.db' AS bronto; + +DROP TABLE IF EXISTS bronto.checkins; + +CREATE TABLE bronto.checkins( + week INT, + dataset TEXT, + release TEXT, + variant TEXT, + arch TEXT, + age INT CHECK(age<5), + hits INT, + UNIQUE (week,dataset,release,variant,arch,age) +); + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_updates_systems" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag LIKE 'updates-released-f__' + AND os_version = substr(repo_tag,-2,2) + AND os_arch = repo_arch + AND os_variant != 'container' + AND os_variant != 'toolbx' + AND os_variant != 'snappy' + GROUP BY week,release,variant,arch,age; + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_updates_containers" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag LIKE 'updates-released-f__' + AND os_version = substr(repo_tag,-2,2) + AND os_arch = repo_arch + AND (os_variant = 'container' OR + os_variant = 'toolbx' OR + os_variant = 'snappy') + GROUP BY week,release,variant,arch,age; + + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_rawhide_systems" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + 
SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag = 'fedora-rawhide' OR repo_tag = 'rawhide' + AND os_arch = repo_arch + AND os_variant != 'container' + AND os_variant != 'toolbx' + AND os_variant != 'snappy' + GROUP BY week,release,variant,arch,age; + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_rawhide_containers" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag = 'rawhide' + AND os_arch = repo_arch + AND (os_variant = 'container' OR + os_variant = 'toolbx' OR + os_variant = 'snappy') + GROUP BY week,release,variant,arch,age; + + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "epel" AS dataset, + CASE instr(os_version,".") + WHEN 0 THEN os_version + ELSE substr(os_version,0,instr(os_version,".")) + END AS release, + os_name AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE repo_tag = 'epel-' || release + AND os_arch = repo_arch + GROUP BY week,release,variant,arch,age; + +DETACH bronto; + +EOF diff --git a/brontosaurus_washer.py b/brontosaurus_washer.py new file mode 100755 index 0000000..b655cb7 --- /dev/null +++ b/brontosaurus_washer.py @@ -0,0 +1,69 @@ +#!/bin/bash +# +# For every table in bronto.db, delete "known bad" variants. +# + + +# Please document each new reason for cleaning something here, so we know +# why later. +# +# The variants 09975472-cc15-4020-9231-cc6743a15b0f and c1e9af0e-5816-4644-9c89-703aa1fdcf04 +# are strings I (mattdm) used for testing. +# +# The variant "CentOS Stream v21.*" is some sort of horrible scripted thing +# apparently, where it keeps getting longer and longer with additional tags +# +# Also, for each table, sets any variant that is '' to 'none', because +# '' is hard to work with. 
(I think this is when people have manually put +# "VARIANT_ID=", as opposed to not having one. I don't think that's useful +# to track separately from 'generic', really, so an alternative would be +# to merge them... but doing this for now.) +# FIXME: this needs to merge them in case someone starts actually sending +# "none" as the string — we'll get a uniqueness constraint violation. +# +# This is a regex, in case that's not clear. +GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" + +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; + UPDATE checkins SET variant='none' WHERE variant=''; +EOF + + +# While some test systems ran Fedora Linux 31, the feature landed +# in 32 (released 2020-04-27, so drop all the old stuff. +FEDORA_STARTVER=32 +FEDORA_STARTDAY='2021-01-01' +# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) +EPEL_STARTVER=8 +EPEL_STARTDAY='2021-01-01' +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; +EOF + +# Clean up entries for name, arch, or release that show up +# with less than some threshold in _total_ hits in the +# whole database, or where the weekly numbers never exceed some +# small value. This removes both small bursts of nonsense +# and also most long-lived singletons. We may want to revisit +# what these are set to when we have more data. 
+# +# Note that since we regenerate the whole db from totals.db +# each week, if something exceeds this threshold later, it will +# suddenly appear +THRESHOLD_TOTAL=100 +THRESHOLD_WEEKLY=3 + +for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do + sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); +EOF +done diff --git a/run.sh b/run.sh index 09b84b5..02432b0 100755 --- a/run.sh +++ b/run.sh @@ -47,7 +47,7 @@ echo -n "* Fossilizing ancient images... " echo " buried." echo -n "* Slicing brontosauruses... " - ./brontosaurus-slicer.sh + ./brontosaurus_slicer.sh if [[ $? != 0 ]]; then echo "! Oops." exit 1 @@ -55,7 +55,7 @@ echo -n "* Slicing brontosauruses... " echo " into bits." echo -n "* Scrubbing off the dirt... " - ./brontosaurus-washer.sh + ./brontosaurus_washer.sh if [[ $? != 0 ]]; then echo "! Oops." exit 1 @@ -63,7 +63,7 @@ echo -n "* Scrubbing off the dirt... " echo " shiny!" echo -n "* Finding the strongest... 
" - ./brontosaurus-fight.sh + ./brontosaurus_fight.sh if [[ $? != 0 ]]; then echo "! Oops." exit 1 @@ -71,7 +71,7 @@ echo -n "* Finding the strongest... " echo " rarrhhhhr!" echo -n "* Sorting the eggs... " - ./brontosaurus-egg-sorter.py + ./brontosaurus_egg-sorter.py if [[ $? != 0 ]]; then echo "! Oops." exit 1 @@ -86,12 +86,12 @@ echo " Built!" echo "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null - ./brontosaurus-colorizer.py + ./brontosaurus_colorizer.py echo " Vibrant!" echo "* Drawing portraits from the fossilized remains... " LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) - ./brontosaurus-plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null + ./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null if [[ $? != 0 ]]; then echo "! Oops." exit 1 From 2701b4551a80acd5dda6639d71df5869bf7e9b52 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 21:56:02 +0000 Subject: [PATCH 16/49] um, yeah, python still :) --- diff --git a/brontosaurus_slicer.py b/brontosaurus_slicer.py deleted file mode 100755 index 8235990..0000000 --- a/brontosaurus_slicer.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash - -# This script implements the filter rules in NOTES.md, and splits -# the totals.db into a new cleaned-up table in a "bronto.db" file. -# -# It splits the records into major groups: EPEL, and then also -# "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. -# Because "group" is a reserved word in sql, we use "dataset". -# -# It removes the os_ prefix, because without repo_ columns there -# is no ambiguity to resolve. -# -# And, it converts weeknums (which start on January 5th, 1970, -# the first Monday of the epoch) to dates. -# -# We're dropping the os_version field from EL and instead just -# using the first digit (e.g. "8" or "9") as "release". We are -# also using _name_ as variant. We could keep these separate, -# but this way we have the same fields for both types. 
- - -sqlite3 db/totals.db << EOF - -ATTACH DATABASE 'db/bronto.db' AS bronto; - -DROP TABLE IF EXISTS bronto.checkins; - -CREATE TABLE bronto.checkins( - week INT, - dataset TEXT, - release TEXT, - variant TEXT, - arch TEXT, - age INT CHECK(age<5), - hits INT, - UNIQUE (week,dataset,release,variant,arch,age) -); - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_systems" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag LIKE 'updates-released-f__' - AND os_version = substr(repo_tag,-2,2) - AND os_arch = repo_arch - AND os_variant != 'container' - AND os_variant != 'toolbx' - AND os_variant != 'snappy' - GROUP BY week,release,variant,arch,age; - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_containers" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag LIKE 'updates-released-f__' - AND os_version = substr(repo_tag,-2,2) - AND os_arch = repo_arch - AND (os_variant = 'container' OR - os_variant = 'toolbx' OR - os_variant = 'snappy') - GROUP BY week,release,variant,arch,age; - - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_systems" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag = 'fedora-rawhide' OR repo_tag = 'rawhide' - AND os_arch = repo_arch - AND os_variant != 'container' - AND os_variant != 'toolbx' - AND os_variant != 'snappy' - GROUP BY week,release,variant,arch,age; - -INSERT INTO bronto.checkins - SELECT 
date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_containers" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag = 'rawhide' - AND os_arch = repo_arch - AND (os_variant = 'container' OR - os_variant = 'toolbx' OR - os_variant = 'snappy') - GROUP BY week,release,variant,arch,age; - - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "epel" AS dataset, - CASE instr(os_version,".") - WHEN 0 THEN os_version - ELSE substr(os_version,0,instr(os_version,".")) - END AS release, - os_name AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE repo_tag = 'epel-' || release - AND os_arch = repo_arch - GROUP BY week,release,variant,arch,age; - -DETACH bronto; - -EOF diff --git a/brontosaurus_slicer.sh b/brontosaurus_slicer.sh new file mode 100755 index 0000000..8235990 --- /dev/null +++ b/brontosaurus_slicer.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# This script implements the filter rules in NOTES.md, and splits +# the totals.db into a new cleaned-up table in a "bronto.db" file. +# +# It splits the records into major groups: EPEL, and then also +# "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. +# Because "group" is a reserved word in sql, we use "dataset". +# +# It removes the os_ prefix, because without repo_ columns there +# is no ambiguity to resolve. +# +# And, it converts weeknums (which start on January 5th, 1970, +# the first Monday of the epoch) to dates. +# +# We're dropping the os_version field from EL and instead just +# using the first digit (e.g. "8" or "9") as "release". We are +# also using _name_ as variant. We could keep these separate, +# but this way we have the same fields for both types. 
+ + +sqlite3 db/totals.db << EOF + +ATTACH DATABASE 'db/bronto.db' AS bronto; + +DROP TABLE IF EXISTS bronto.checkins; + +CREATE TABLE bronto.checkins( + week INT, + dataset TEXT, + release TEXT, + variant TEXT, + arch TEXT, + age INT CHECK(age<5), + hits INT, + UNIQUE (week,dataset,release,variant,arch,age) +); + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_updates_systems" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag LIKE 'updates-released-f__' + AND os_version = substr(repo_tag,-2,2) + AND os_arch = repo_arch + AND os_variant != 'container' + AND os_variant != 'toolbx' + AND os_variant != 'snappy' + GROUP BY week,release,variant,arch,age; + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_updates_containers" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag LIKE 'updates-released-f__' + AND os_version = substr(repo_tag,-2,2) + AND os_arch = repo_arch + AND (os_variant = 'container' OR + os_variant = 'toolbx' OR + os_variant = 'snappy') + GROUP BY week,release,variant,arch,age; + + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_rawhide_systems" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag = 'fedora-rawhide' OR repo_tag = 'rawhide' + AND os_arch = repo_arch + AND os_variant != 'container' + AND os_variant != 'toolbx' + AND os_variant != 'snappy' + GROUP BY week,release,variant,arch,age; + +INSERT INTO bronto.checkins + SELECT 
date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_rawhide_containers" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag = 'rawhide' + AND os_arch = repo_arch + AND (os_variant = 'container' OR + os_variant = 'toolbx' OR + os_variant = 'snappy') + GROUP BY week,release,variant,arch,age; + + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "epel" AS dataset, + CASE instr(os_version,".") + WHEN 0 THEN os_version + ELSE substr(os_version,0,instr(os_version,".")) + END AS release, + os_name AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE repo_tag = 'epel-' || release + AND os_arch = repo_arch + GROUP BY week,release,variant,arch,age; + +DETACH bronto; + +EOF diff --git a/brontosaurus_washer.py b/brontosaurus_washer.py deleted file mode 100755 index b655cb7..0000000 --- a/brontosaurus_washer.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# -# For every table in bronto.db, delete "known bad" variants. -# - - -# Please document each new reason for cleaning something here, so we know -# why later. -# -# The variants 09975472-cc15-4020-9231-cc6743a15b0f and c1e9af0e-5816-4644-9c89-703aa1fdcf04 -# are strings I (mattdm) used for testing. -# -# The variant "CentOS Stream v21.*" is some sort of horrible scripted thing -# apparently, where it keeps getting longer and longer with additional tags -# -# Also, for each table, sets any variant that is '' to 'none', because -# '' is hard to work with. (I think this is when people have manually put -# "VARIANT_ID=", as opposed to not having one. I don't think that's useful -# to track separately from 'generic', really, so an alternative would be -# to merge them... but doing this for now.) 
-# FIXME: this needs to merge them in case someone starts actually sending -# "none" as the string — we'll get a uniqueness constraint violation. -# -# This is a regex, in case that's not clear. -GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" - -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; - UPDATE checkins SET variant='none' WHERE variant=''; -EOF - - -# While some test systems ran Fedora Linux 31, the feature landed -# in 32 (released 2020-04-27, so drop all the old stuff. -FEDORA_STARTVER=32 -FEDORA_STARTDAY='2021-01-01' -# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) -EPEL_STARTVER=8 -EPEL_STARTDAY='2021-01-01' -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; -EOF - -# Clean up entries for name, arch, or release that show up -# with less than some threshold in _total_ hits in the -# whole database, or where the weekly numbers never exceed some -# small value. This removes both small bursts of nonsense -# and also most long-lived singletons. We may want to revisit -# what these are set to when we have more data. 
-# -# Note that since we regenerate the whole db from totals.db -# each week, if something exceeds this threshold later, it will -# suddenly appear -THRESHOLD_TOTAL=100 -THRESHOLD_WEEKLY=3 - -for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do - sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); -EOF -done diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh new file mode 100755 index 0000000..b655cb7 --- /dev/null +++ b/brontosaurus_washer.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# +# For every table in bronto.db, delete "known bad" variants. +# + + +# Please document each new reason for cleaning something here, so we know +# why later. +# +# The variants 09975472-cc15-4020-9231-cc6743a15b0f and c1e9af0e-5816-4644-9c89-703aa1fdcf04 +# are strings I (mattdm) used for testing. 
+# +# The variant "CentOS Stream v21.*" is some sort of horrible scripted thing +# apparently, where it keeps getting longer and longer with additional tags +# +# Also, for each table, sets any variant that is '' to 'none', because +# '' is hard to work with. (I think this is when people have manually put +# "VARIANT_ID=", as opposed to not having one. I don't think that's useful +# to track separately from 'generic', really, so an alternative would be +# to merge them... but doing this for now.) +# FIXME: this needs to merge them in case someone starts actually sending +# "none" as the string — we'll get a uniqueness constraint violation. +# +# This is a regex, in case that's not clear. +GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" + +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; + UPDATE checkins SET variant='none' WHERE variant=''; +EOF + + +# While some test systems ran Fedora Linux 31, the feature landed +# in 32 (released 2020-04-27, so drop all the old stuff. +FEDORA_STARTVER=32 +FEDORA_STARTDAY='2021-01-01' +# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) +EPEL_STARTVER=8 +EPEL_STARTDAY='2021-01-01' +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; +EOF + +# Clean up entries for name, arch, or release that show up +# with less than some threshold in _total_ hits in the +# whole database, or where the weekly numbers never exceed some +# small value. This removes both small bursts of nonsense +# and also most long-lived singletons. We may want to revisit +# what these are set to when we have more data. 
+# +# Note that since we regenerate the whole db from totals.db +# each week, if something exceeds this threshold later, it will +# suddenly appear +THRESHOLD_TOTAL=100 +THRESHOLD_WEEKLY=3 + +for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do + sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); +EOF +done From b5f0debf73559f007deed675e1b6221e21f9eae7 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 21:56:36 +0000 Subject: [PATCH 17/49] one line, sure --- diff --git a/run.sh b/run.sh index 02432b0..a8d85a0 100755 --- a/run.sh +++ b/run.sh @@ -84,10 +84,10 @@ echo "* Creating cages for different exhibits..." done echo " Built!" -echo "* Painting the feathers..." +echo -n "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null ./brontosaurus_colorizer.py -echo " Vibrant!" +echo " vibrant!" echo "* Drawing portraits from the fossilized remains... 
" LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) From 3e01bc96db958bcb841dc5436376f30915ce35cd Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 22:49:03 +0000 Subject: [PATCH 18/49] maybe I'm just hitting myself in the face making the cache be toml? --- diff --git a/brontosaurus_colorizer.py b/brontosaurus_colorizer.py index 878f4dc..e3870f4 100755 --- a/brontosaurus_colorizer.py +++ b/brontosaurus_colorizer.py @@ -15,7 +15,7 @@ from collections import OrderedDict import toml import re - +from deepmerge import always_merger def get_colors(colormappings, colorlist, dataset, dataseries, items): @@ -28,8 +28,9 @@ def get_colors(colormappings, colorlist, dataset, dataseries, items): outcolors = [] for item in items: if item not in colormappings[key]: - colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] - outcolors.append(colormappings[key][str(item)]) + print(f"Miss {item} in {key}") + colormappings[key][item] = colorlist[len(colormappings[key]) % len(colorlist)] + outcolors.append(colormappings[key][item]) return outcolors @@ -45,7 +46,7 @@ def load_color_cache(cachefile,presetfile): try: presets = toml.load(presetfile) - cached.update(presets) + always_merger.merge(cached,presets) except FileNotFoundError: print(f"No color preset file {presetfile} found.") diff --git a/color-presets.toml b/color-presets.toml index dd54edb..8c0daf7 100644 --- a/color-presets.toml +++ b/color-presets.toml @@ -1,8 +1,8 @@ ["epel.variant"] -"CentOS Linux" = "#101010" +"CentOS Linux" = "#808080" "Red Hat Enterprise Linux" = "#ee0000" "CentOS Stream" = "#a14a8c" "Rocky Linux" = "#10b981" "AlmaLinux" = "#ffcc0a" -"Oracle Linux Server" = "#aaaaaa" +"Oracle Linux Server" = "#101010" "CloudLinux" = "#0097f3" From c1080ba0f8906845e8dfc41ad6de60b21b50ea50 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 00:07:37 +0000 Subject: [PATCH 19/49] there now toml works --- diff --git a/brontosaurus_colorizer.py 
b/brontosaurus_colorizer.py index e3870f4..c39cb4f 100755 --- a/brontosaurus_colorizer.py +++ b/brontosaurus_colorizer.py @@ -27,10 +27,10 @@ def get_colors(colormappings, colorlist, dataset, dataseries, items): # and save that for later. outcolors = [] for item in items: - if item not in colormappings[key]: + if str(item) not in colormappings[key]: print(f"Miss {item} in {key}") - colormappings[key][item] = colorlist[len(colormappings[key]) % len(colorlist)] - outcolors.append(colormappings[key][item]) + colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] + outcolors.append(colormappings[key][str(item)]) return outcolors @@ -54,7 +54,10 @@ def load_color_cache(cachefile,presetfile): # gotta do this because we want a defaultdict but # toml load just gives us a regular dict. for key in cached.keys(): - colormappings[key] = cached[key].copy() + # and this because we want the item keys to be strings + # even if they look like integers + for (item,color) in cached[key].items(): + colormappings[key][str(item)] = cached[key][item] return colormappings From cc15d2400a73e80ec9539a3e597f0205e0540297 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 01:35:59 +0000 Subject: [PATCH 20/49] good enough for now! --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 29ca465..46d972c 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -1,4 +1,17 @@ #!/usr/bin/python3 +""" +Brontosaurus Plotter + +Usage: + brontosaurus_plotter.py timeseries ( line | stacked | share ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py byrelease ( stacked | split ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py show + +Options: + --exclude ... In the form `dataseries:element`. Can repeat. + --include ... As above, but include _only_ these. 
+ --cutoff Drop items where the dataseries has less than n total hits +""" import matplotlib.dates as dates import matplotlib.pyplot as plt @@ -13,6 +26,8 @@ import toml import matplotlib as m +from docopt import docopt + from brontosaurus_colorizer import load_color_cache, get_colors DATAFILE = 'db/bronto.db' @@ -27,113 +42,10 @@ m.rcParams['legend.frameon'] = False -def graph_timeseries(config, colormappings, params, dataframe): - """Draws line or area chart for a dataseries over time.""" - - # If we find we have missing data, in the future: - # dataframe.resample('W-MON') - - dataset = params['dataset'] - dataseries = params['dataseries'] - - ################# - # Instead of this, accumulate anything more than 10 into "other" - # ... and do it _elsewhere_ (easier to do before pivot anyway!) - # + limit number of columns to 10 + other - - hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < 0.2/100 - dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) - - ################## - # our colors. - # the complication here is keeping the same color for the same label - # across multiple graphs! - cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], - dataset, dataseries, list(dataframe.columns))) - - ################## - # and now.... graph it! - - # FIXME: this is ugly - startdate = config['startdate'][dataset.split('_', 1)[0]] - - for view in params['views']: - - match view: - case 'line': - df = dataframe[startdate:] - kind = 'line' - colormap = cmap - case 'stacked': - df = dataframe[startdate:][dataframe.columns[::-1]] - kind = 'area' - colormap = m.colors.ListedColormap(cmap.colors[::-1]) - case 'share': - df = dataframe[startdate:].div( - dataframe.sum(axis=1), axis=0)*100 - kind = 'area' - colormap = cmap - - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind) - - # Labels and titles and stuff. 
- ax = plt.gca() - - handles, labels = ax.get_legend_handles_labels() - - # TODO: generalize this - if dataseries == 'age': - labels = list(map(config['age_labels'].get, labels)) - - if view == 'stacked': - handles[:] = handles[::-1] - labels[:] = labels[::-1] - - plt.legend(handles, labels, loc='center left', - bbox_to_anchor=(1.0, 0.5)) - - madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'view': view, - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][view]} - - if 'title' in params: - plt.suptitle(Template(params['title']).safe_substitute(madlibs), - fontsize=24) - - # FIX: make work - if 'subtitle' in params: - graph.set_title( - Template(params['subtitle']).safe_substitute(madlibs), - fontsize=14) - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", - dpi=config['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") - - -def graph_releasebars(config, colormappings, params, dataframe): - """Draws each release in the set as a bar chart""" +def graph(config, colormappings, params, dataframe): + # If we find we have missing data, in the future: # dataframe.resample('W-MON') @@ -162,73 +74,89 @@ def graph_releasebars(config, colormappings, params, dataframe): # FIXME: this is ugly startdate = config['startdate'][dataset.split('_', 1)[0]] - for view in 
params['views']: - - match view: - case 'stacked': - df = dataframe[startdate:] - kind = 'bar' - colormap = cmap - case 'share': + stacked = True + + match params['type']: + case 'byrelease': + kind = 'bar' + case 'timeseries': + kind = 'area' + + match params['view']: + case 'line': + df = dataframe[startdate:] + kind = 'line' # overrides 'area' + stacked = False # true everywhere else! + colormap = cmap + case 'stacked': + df = dataframe[startdate:][dataframe.columns[::-1]] + colormap = m.colors.ListedColormap(cmap.colors[::-1]) + case 'share': + if dataseries == 'age': + # lower numbers are newer! + df = dataframe[startdate:][dataframe.columns[::-1]].div( + dataframe.sum(axis=1), axis=0)*100 + colormap = m.colors.ListedColormap(cmap.colors[::-1]) + else: + # todo: sort arch and variant by popularity, not name! df = dataframe[startdate:].div( dataframe.sum(axis=1), axis=0)*100 - kind = 'bar' colormap = cmap - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind, stacked=True) + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind, stacked=stacked) - # Labels and titles and stuff. - ax = plt.gca() + # Labels and titles and stuff. 
+ ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() + handles, labels = ax.get_legend_handles_labels() - # TODO: generalize this - if dataseries == 'age': - labels = list(map(config['age_labels'].get, labels)) + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) - if view == 'stacked': - handles[:] = handles[::-1] - labels[:] = labels[::-1] + if params['view'] == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] - plt.legend(handles, labels, loc='center left', - bbox_to_anchor=(1.0, 0.5)) + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) - madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'view': view, - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][view]} + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': params['view'], + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][params['view']]} - if 'title' in params: - plt.suptitle(Template(params['title']).safe_substitute(madlibs), - fontsize=24) + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) - # FIX: make work - if 'subtitle' in params: - graph.set_title( - Template(params['subtitle']).safe_substitute(madlibs), - fontsize=14) + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # 
graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') - for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", - dpi=config['dpi'], bbox_inches="tight") + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") - plt.close(graph.figure) - print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") + plt.close(graph.figure) + print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") ########################################### @@ -236,64 +164,81 @@ def graph_releasebars(config, colormappings, params, dataframe): def main(): - config = toml.load("config.toml") + arguments = docopt(__doc__, version='0.1') + + config = toml.load("config.toml") colormappings = load_color_cache(config['color_cache'],config['color_presets']) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + cur = database.cursor() + cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") + timestamp = cur.fetchone() + print(f"TIME: {timestamp}") + + #pprint(arguments) + if 'timeseries' in config: for timeseries in config['timeseries']: params = config['timeseries_defaults'].copy() params.update(timeseries) - - query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits - FROM checkins - WHERE dataset =\"{params['dataset']}\" - 
{params['extraselect']} - GROUP BY week, {params['dataseries']} - ORDER BY week - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - graph_timeseries( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) + params['type'] = 'timeseries' + + for view in params['views']: + params['view']=view + + query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits + FROM checkins + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY week, {params['dataseries']} + ORDER BY week + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + graph( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], + values='hits').astype("Int64"), + ) if 'byrelease' in config: for byrelease in config['byrelease']: params = config['byrelease_defaults'].copy() params.update(byrelease) - - query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits - FROM peak - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY release, {params['dataseries']} - ORDER BY release - """ - df = pd.read_sql_query(query, con=database) - - graph_releasebars( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='release', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - -''' -### getting ahead of myself: this is for the waffle charts -query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" -df = pd.read_sql_query(query, parse_dates='week', con=database) -df -''' + params['type'] = 'byrelease' + + for view in params['views']: + params['view']=view + + query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits + FROM peak + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY 
release, {params['dataseries']} + ORDER BY release + """ + df = pd.read_sql_query(query, con=database) + + graph( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='release', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + + + ''' + ### getting ahead of myself: this is for the waffle charts + query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" + df = pd.read_sql_query(query, parse_dates='week', con=database) + df + ''' if __name__ == "__main__": main() diff --git a/config.toml b/config.toml index 01163b4..2a52e6c 100644 --- a/config.toml +++ b/config.toml @@ -3,7 +3,7 @@ datafile = "db/bronto.db" color_presets = "color-presets.toml" color_cache = "db/color-cache.toml" -ephemeral = "all" +imagepath="images/$filetype/$dataset/" figsize = [16, 9] dpi = 300 @@ -67,8 +67,8 @@ age="age category" [timeseries_defaults] title="$dataset_label: weekly checkins by $dataseries_label$view_label" -filebase="$dataset-timeseries-$dataseries-$view" extraselect="" +filebase="$dataset-timeseries-$dataseries-$view" # not all of these are implemented. But we could have... #subtitle= #dataset= @@ -78,6 +78,14 @@ extraselect="" # todo: back to the idea of reading these # from individual, merged configuration files! 
+[byrelease_defaults] +title="$dataset_label: $dataseries_label by release" +subtitle="data for each release taken from the week of that release's (current) peak" +filebase="$dataset-byrelease-$dataseries-$view" +extraselect="" +views=['stacked','share'] + + [[timeseries]] dataset="fedora_updates_systems" dataseries="release" @@ -440,13 +448,6 @@ views=['line','share'] extraselect="AND age>0" filebase="$dataset-timeseries-$dataseries-$view-persistent" -[byrelease_defaults] -title="$dataset_label: $dataseries_label by release" -subtitle="data for each release taken from the week of that release's (current) peak" -filebase="$dataset-byrelease-$dataseries-$view" -extraselect="" -views=['stacked','share'] - [[byrelease]] dataset="fedora_updates_systems" dataseries="age" From 926ddf1ebf99c93a02bc997680c93fca860cbff7 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 19:09:29 +0000 Subject: [PATCH 21/49] reorg --- diff --git a/brontosaurus_colorizer.py b/brontosaurus_colorizer.py index c39cb4f..69b18eb 100755 --- a/brontosaurus_colorizer.py +++ b/brontosaurus_colorizer.py @@ -8,65 +8,18 @@ This is saved to from pprint import pprint import sqlite3 -from collections import defaultdict -from collections import OrderedDict - import toml import re -from deepmerge import always_merger - - -def get_colors(colormappings, colorlist, dataset, dataseries, items): - """This makes colors 'sticky' for the whole run.""" - - key = dataset + '.' + dataseries - - # for each label item, assign the next color in the colorlist - # and save that for later. 
- outcolors = [] - for item in items: - if str(item) not in colormappings[key]: - print(f"Miss {item} in {key}") - colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] - outcolors.append(colormappings[key][str(item)]) - - return outcolors - - -def load_color_cache(cachefile,presetfile): - - colormappings = defaultdict(OrderedDict) - try: - cached = toml.load(cachefile) - except FileNotFoundError: - print(f"Can't open color cache {cachefile}, so starting fresh.") - cached = {} - - try: - presets = toml.load(presetfile) - always_merger.merge(cached,presets) - except FileNotFoundError: - print(f"No color preset file {presetfile} found.") - - - # gotta do this because we want a defaultdict but - # toml load just gives us a regular dict. - for key in cached.keys(): - # and this because we want the item keys to be strings - # even if they look like integers - for (item,color) in cached[key].items(): - colormappings[key][str(item)] = cached[key][item] - - return colormappings +from brontosaurusifier_utils import colormapping def main(): config = toml.load("config.toml") - colormappings = load_color_cache(config['color_cache'],config['color_presets']) + colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) database = sqlite3.connect(config['datafile'], detect_types=sqlite3.PARSE_DECLTYPES) cur = database.cursor() @@ -107,7 +60,7 @@ def main(): items = [t[0] for t in cur.fetchall()] #print(dataset,dataseries,items) - get_colors(colormappings, config['colors'], dataset, dataseries, items) + colormapping.get_colors(colormappings, config['colors'], dataset, dataseries, items) with open(config['color_cache'], "w") as toml_file: diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 46d972c..e382d8a 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -13,7 +13,7 @@ Options: --cutoff Drop items where the dataseries has less than n total hits """ -import matplotlib.dates as dates 
+#import matplotlib.dates as dates import matplotlib.pyplot as plt import sqlite3 @@ -28,7 +28,7 @@ import matplotlib as m from docopt import docopt -from brontosaurus_colorizer import load_color_cache, get_colors +from brontosaurusifier_utils import colormapping DATAFILE = 'db/bronto.db' @@ -65,7 +65,7 @@ def graph(config, colormappings, params, dataframe): # our colors. # the complication here is keeping the same color for the same label # across multiple graphs! - cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], + cmap = m.colors.ListedColormap(colormapping.get_colors(colormappings, config['colors'], dataset, dataseries, list(dataframe.columns))) ################## @@ -169,7 +169,7 @@ def main(): config = toml.load("config.toml") - colormappings = load_color_cache(config['color_cache'],config['color_presets']) + colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) diff --git a/brontosaurusifier_utils/__init__.py b/brontosaurusifier_utils/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/brontosaurusifier_utils/__init__.py diff --git a/brontosaurusifier_utils/colormapping.py b/brontosaurusifier_utils/colormapping.py new file mode 100644 index 0000000..c338a07 --- /dev/null +++ b/brontosaurusifier_utils/colormapping.py @@ -0,0 +1,52 @@ + + +from collections import defaultdict +from collections import OrderedDict + + +import toml + + +from deepmerge import always_merger + +def get_colors(colormappings, colorlist, dataset, dataseries, items): + """This makes colors 'sticky' for the whole run.""" + + key = dataset + '.' + dataseries + + # for each label item, assign the next color in the colorlist + # and save that for later. 
+ outcolors = [] + for item in items: + if str(item) not in colormappings[key]: + colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] + outcolors.append(colormappings[key][str(item)]) + + return outcolors + + +def load_color_cache(cachefile,presetfile): + + colormappings = defaultdict(OrderedDict) + try: + cached = toml.load(cachefile) + except FileNotFoundError: + print(f"Can't open color cache {cachefile}, so starting fresh.") + cached = {} + + try: + presets = toml.load(presetfile) + always_merger.merge(cached,presets) + except FileNotFoundError: + print(f"No color preset file {presetfile} found.") + + + # gotta do this because we want a defaultdict but + # toml load just gives us a regular dict. + for key in cached.keys(): + # and this because we want the item keys to be strings + # even if they look like integers + for (item,color) in cached[key].items(): + colormappings[key][str(item)] = cached[key][item] + + return colormappings \ No newline at end of file From dbcab8e0f446594886b1895257ee13f3e2b9add3 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 20:12:55 +0000 Subject: [PATCH 22/49] wip --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index e382d8a..d60062e 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -3,9 +3,8 @@ Brontosaurus Plotter Usage: - brontosaurus_plotter.py timeseries ( line | stacked | share ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py byrelease ( stacked | split ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py show + brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py releasebars ( text | stacked | share | split ) [--exclude ...|--include ...] [--cutoff ] Options: --exclude ... In the form `dataseries:element`. Can repeat. 
@@ -24,6 +23,8 @@ from pprint import pprint import pandas as pd import toml +import re + import matplotlib as m from docopt import docopt @@ -44,7 +45,7 @@ m.rcParams['legend.frameon'] = False -def graph(config, colormappings, params, dataframe): +def draw_graph(config, colormappings, params, dataframe): # If we find we have missing data, in the future: # dataframe.resample('W-MON') @@ -75,19 +76,15 @@ def graph(config, colormappings, params, dataframe): startdate = config['startdate'][dataset.split('_', 1)[0]] stacked = True + subplots = False match params['type']: - case 'byrelease': + case 'releasebars': kind = 'bar' case 'timeseries': kind = 'area' - match params['view']: - case 'line': - df = dataframe[startdate:] - kind = 'line' # overrides 'area' - stacked = False # true everywhere else! - colormap = cmap + match params['graph']: case 'stacked': df = dataframe[startdate:][dataframe.columns[::-1]] colormap = m.colors.ListedColormap(cmap.colors[::-1]) @@ -102,10 +99,26 @@ def graph(config, colormappings, params, dataframe): df = dataframe[startdate:].div( dataframe.sum(axis=1), axis=0)*100 colormap = cmap + case 'line': + """ This is timeseries-only. """ + df = dataframe[startdate:] + kind = 'line' # overrides 'area' + stacked = False # true everywhere else! + colormap = cmap + case 'split': + """ This is releasebars-only. """ + df = dataframe[startdate:] + colormap = cmap + subplots = True + + # Start the actual graph graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind, stacked=stacked) + colormap=colormap, + kind=kind, + stacked=stacked, + subplots=subplots) # Labels and titles and stuff. 
ax = plt.gca() @@ -176,69 +189,87 @@ def main(): cur = database.cursor() cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") timestamp = cur.fetchone() - print(f"TIME: {timestamp}") - - #pprint(arguments) - - if 'timeseries' in config: - for timeseries in config['timeseries']: - params = config['timeseries_defaults'].copy() - params.update(timeseries) - params['type'] = 'timeseries' - - for view in params['views']: - params['view']=view - - query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits - FROM checkins - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY week, {params['dataseries']} - ORDER BY week - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - graph( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - if 'byrelease' in config: - for byrelease in config['byrelease']: - params = config['byrelease_defaults'].copy() - params.update(byrelease) - params['type'] = 'byrelease' - - for view in params['views']: - params['view']=view - - query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits - FROM peak - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY release, {params['dataseries']} - ORDER BY release - """ - df = pd.read_sql_query(query, con=database) - - graph( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='release', columns=params['dataseries'], + params = config['graph_defaults'].copy() + params['timestamp'] = timestamp + + cur.execute("SELECT dataset FROM checkins GROUP BY dataset") + datasets = [t[0] for t in cur.fetchall()] + cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() + dataserieses = [t[0] for t in cur.description] + dataserieses.remove('week') + dataserieses.remove('dataset') + dataserieses.remove('hits') + + pprint(arguments) + + + 
dataset=arguments[''] + dataseries=arguments[''] + + if not re.match('^[0-9a-z_]*$', dataset): + print(f"Bad dataset name! '%{dataset}") + exit(1) + if not dataset in datasets: + print(f"Dataset '%{dataset}' not in database.") + exit(1) + if not re.match('^[0-9a-z_]*$', dataseries): + print(f"Bad dataseries name! '%{dataseries}") + exit(1) + if not dataseries in dataserieses: + print(f"Dataseries '%{dataseries}' not in database.") + exit(1) + + + + if arguments['timeseries']: + params['type'] = 'timeseries' + elif arguments['releasebars']: + params['type'] = 'releasebars' + # TODO: waffle! + # read in defaults from config.toml + params.update(config[params['type']]) + + table = params['table'] + + + # maybe docopt isn't the best choice. oh well. + for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: + if arguments[graphtype]: + params['graph']=graphtype + break + + + if params['graph'] == 'text': + query = f"""SELECT {dataseries},sum(hits) AS total + FROM {table} + WHERE dataset = '{dataset}' + GROUP BY {dataseries} + ORDER BY total + ASC + """ + cur.execute(query) + # TODO: add title here + for (item,hits) in cur: + print(f"{hits:-10} — {item:40}") + else: + xaxis = params['xaxis'] + + query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits + FROM {table} + WHERE dataset =\"{dataset}\" + GROUP BY {xaxis}, {params['dataseries']} + ORDER BY {xaxis} + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + draw_graph(config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], values='hits').astype("Int64"), ) - ''' - ### getting ahead of myself: this is for the waffle charts - query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" - df = pd.read_sql_query(query, parse_dates='week', con=database) - df - ''' - if __name__ == "__main__": main() diff --git a/config.toml b/config.toml index 2a52e6c..b65d634 
100644 --- a/config.toml +++ b/config.toml @@ -65,474 +65,19 @@ age="age category" 'stacked'=" (stacked)" 'share'=" (share)" -[timeseries_defaults] -title="$dataset_label: weekly checkins by $dataseries_label$view_label" +[graph_defaults] +table="checkins" +title="$dataset_label: $dataseries_label $type_label" +# TODO: list options! + +[timeseries] +subtitle="weekly checkins" +xaxis = "week" extraselect="" filebase="$dataset-timeseries-$dataseries-$view" -# not all of these are implemented. But we could have... -#subtitle= -#dataset= -#dataseries= -#orderbyhits= -#reverse= -# todo: back to the idea of reading these -# from individual, merged configuration files! -[byrelease_defaults] -title="$dataset_label: $dataseries_label by release" +[releasebars] +table="peak" +xaxis = "release" subtitle="data for each release taken from the week of that release's (current) peak" -filebase="$dataset-byrelease-$dataseries-$view" -extraselect="" -views=['stacked','share'] - - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="arch" 
-views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" 
-filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="age" -views=['share','stacked'] - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" 
-dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="arch" 
-views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -[[timeseries]] -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="epel" -dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="epel" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="epel" -dataseries="variant" -views=['line','share'] 
-filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="age" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="arch" -filebase="$dataset-byrelease-$dataseries-$view-all" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="arch" -extraselect="AND age=0" -filebase="$dataset-byrelease-$dataseries-$view-ephemeral" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="arch" -extraselect="AND age>0" -filebase="$dataset-byrelease-$dataseries-$view-persistent" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="variant" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="variant" -extraselect="AND age=0" -filebase="$dataset-byrelease-$dataseries-$view-ephemeral" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="variant" -extraselect="AND age>0" -filebase="$dataset-byrelease-$dataseries-$view-persistent" - -[[byrelease]] -dataset="fedora_updates_containers" -dataseries="age" - -[[byrelease]] -dataset="fedora_updates_containers" -dataseries="arch" - -[[byrelease]] -dataset="fedora_updates_containers" -dataseries="variant" - - -[[byrelease]] -dataset="fedora_rawhide_systems" -dataseries="age" - -[[byrelease]] -dataset="fedora_rawhide_systems" -dataseries="arch" - -[[byrelease]] -dataset="fedora_rawhide_systems" -dataseries="variant" - - -[[byrelease]] -dataset="fedora_rawhide_containers" -dataseries="age" - -[[byrelease]] -dataset="fedora_rawhide_containers" -dataseries="arch" - -[[byrelease]] -dataset="fedora_rawhide_containers" -dataseries="variant" - - 
-[[byrelease]] -dataset="epel" -dataseries="age" - -[[byrelease]] -dataset="epel" -dataseries="arch" - -[[byrelease]] -dataset="epel" -dataseries="variant" From 1934955a652347e3860f50f862101e4e8491fbe4 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 23:24:00 +0000 Subject: [PATCH 23/49] ahahaa --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index d60062e..17bf045 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -44,22 +44,113 @@ m.rcParams['legend.frameon'] = False +def main(): + + arguments = docopt(__doc__, version='0.1') + + + config = toml.load("config.toml") + + colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) + + database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + + cur = database.cursor() + cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") + timestamp = cur.fetchone() + + params = config['graph_defaults'].copy() + params['timestamp'] = timestamp + + cur.execute("SELECT dataset FROM checkins GROUP BY dataset") + datasets = [t[0] for t in cur.fetchall()] + cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() + dataserieses = [t[0] for t in cur.description] + dataserieses.remove('week') + dataserieses.remove('dataset') + dataserieses.remove('hits') + + #pprint(arguments) + + + dataset=arguments[''] + dataseries=arguments[''] + + if not re.match('^[0-9a-z_]*$', dataset): + print(f"Bad dataset name! '%{dataset}") + exit(1) + if not dataset in datasets: + print(f"Dataset '%{dataset}' not in database.") + exit(1) + if not re.match('^[0-9a-z_]*$', dataseries): + print(f"Bad dataseries name! 
'%{dataseries}") + exit(1) + if not dataseries in dataserieses: + print(f"Dataseries '%{dataseries}' not in database.") + exit(1) + + startdate = config['startdate'][dataset] + + + if arguments['timeseries']: + params['type'] = 'timeseries' + elif arguments['releasebars']: + params['type'] = 'releasebars' + # TODO: waffle! + + # read in defaults from config.toml + params.update(config[params['type']]) + + table = params['table'] + + + # maybe docopt isn't the best choice. oh well. + for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: + if arguments[graphtype]: + params['graph']=graphtype + break + -def draw_graph(config, colormappings, params, dataframe): + if params['graph'] == 'text': + query = f"""SELECT {dataseries},sum(hits) AS total + FROM {table} + WHERE dataset = '{dataset}' + GROUP BY {dataseries} + ORDER BY total + ASC + """ + cur.execute(query) + # TODO: add title here + for (item,hits) in cur: + print(f"{hits:-10} — {item:40}") + exit(0) - # If we find we have missing data, in the future: - # dataframe.resample('W-MON') + xaxis = params['xaxis'] + + query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits + FROM {table} + WHERE dataset =\"{dataset}\" + GROUP BY {xaxis}, {dataseries} + ORDER BY {xaxis} + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + + dataframe=df.pivot(index=xaxis, + columns=dataseries, + values='hits' + ).astype("Int64") - dataset = params['dataset'] - dataseries = params['dataseries'] + # Smooth over any missing data + #dataframe.resample('W-MON') + pprint(dataframe ) ################# # Instead of this, accumulate anything more than 10 into "other" # ... and do it _elsewhere_ (easier to do before pivot anyway!) 
# + limit number of columns to 10 + other - hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < 0.2/100 + hidelist = dataframe.div(dataframe.sum(axis=1), axis=0).max() < 0.2/100 dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) ################## @@ -72,8 +163,6 @@ def draw_graph(config, colormappings, params, dataframe): ################## # and now.... graph it! - # FIXME: this is ugly - startdate = config['startdate'][dataset.split('_', 1)[0]] stacked = True subplots = False @@ -129,7 +218,8 @@ def draw_graph(config, colormappings, params, dataframe): if dataseries == 'age': labels = list(map(config['age_labels'].get, labels)) - if params['view'] == 'stacked': + # hmmm + if params['graph'] == 'stacked' or params['graph'] == 'shared': handles[:] = handles[::-1] labels[:] = labels[::-1] @@ -138,10 +228,10 @@ def draw_graph(config, colormappings, params, dataframe): madlibs = {'dataseries': dataseries, 'dataset': dataset, - 'view': params['view'], + 'graph': params['graph'], 'dataseries_label': config['dataseries_labels'][dataseries], 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][params['view']]} + 'view_label': config['view_labels'][params['graph']]} if 'title' in params: plt.suptitle(Template(params['title']).safe_substitute(madlibs), @@ -172,104 +262,5 @@ def draw_graph(config, colormappings, params, dataframe): print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") -########################################### - - -def main(): - - arguments = docopt(__doc__, version='0.1') - - - config = toml.load("config.toml") - - colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) - - database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - - cur = database.cursor() - cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") - timestamp = cur.fetchone() - - params = 
config['graph_defaults'].copy() - params['timestamp'] = timestamp - - cur.execute("SELECT dataset FROM checkins GROUP BY dataset") - datasets = [t[0] for t in cur.fetchall()] - cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() - dataserieses = [t[0] for t in cur.description] - dataserieses.remove('week') - dataserieses.remove('dataset') - dataserieses.remove('hits') - - pprint(arguments) - - - dataset=arguments[''] - dataseries=arguments[''] - - if not re.match('^[0-9a-z_]*$', dataset): - print(f"Bad dataset name! '%{dataset}") - exit(1) - if not dataset in datasets: - print(f"Dataset '%{dataset}' not in database.") - exit(1) - if not re.match('^[0-9a-z_]*$', dataseries): - print(f"Bad dataseries name! '%{dataseries}") - exit(1) - if not dataseries in dataserieses: - print(f"Dataseries '%{dataseries}' not in database.") - exit(1) - - - - if arguments['timeseries']: - params['type'] = 'timeseries' - elif arguments['releasebars']: - params['type'] = 'releasebars' - # TODO: waffle! - # read in defaults from config.toml - params.update(config[params['type']]) - - table = params['table'] - - - # maybe docopt isn't the best choice. oh well. 
- for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: - if arguments[graphtype]: - params['graph']=graphtype - break - - - if params['graph'] == 'text': - query = f"""SELECT {dataseries},sum(hits) AS total - FROM {table} - WHERE dataset = '{dataset}' - GROUP BY {dataseries} - ORDER BY total - ASC - """ - cur.execute(query) - # TODO: add title here - for (item,hits) in cur: - print(f"{hits:-10} — {item:40}") - else: - xaxis = params['xaxis'] - - query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits - FROM {table} - WHERE dataset =\"{dataset}\" - GROUP BY {xaxis}, {params['dataseries']} - ORDER BY {xaxis} - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - draw_graph(config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - if __name__ == "__main__": main() diff --git a/config.toml b/config.toml index b65d634..6676546 100644 --- a/config.toml +++ b/config.toml @@ -37,8 +37,11 @@ colors = [ image_types = ["png"] [startdate] -fedora = '2021-01-01' # F32 release not fully captured, so start here. -epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 +fedora_updates_systems = '2021-01-01' # F32 release not fully captured, so start here. +fedora_updates_containers = '2021-01-01' +fedora_rawhide_systems = '2021-01-01' +fedora_rawhide_containers = '2021-01-01' +epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 [dataset_labels] epel = "Extra Packages for Enterprise Linux" @@ -68,13 +71,13 @@ age="age category" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" +filebase="$dataset-timeseries-$dataseries-$graph" # TODO: list options! 
[timeseries] subtitle="weekly checkins" xaxis = "week" extraselect="" -filebase="$dataset-timeseries-$dataseries-$view" [releasebars] table="peak" From b60b300d7b1596532c9729bc42153dbcb2d8a017 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 23:49:49 +0000 Subject: [PATCH 24/49] hacky way to generate 'em all --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 17bf045..0b8121f 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -4,7 +4,7 @@ Brontosaurus Plotter Usage: brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py releasebars ( text | stacked | share | split ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py releasebar ( text | stacked | share | split ) [--exclude ...|--include ...] [--cutoff ] Options: --exclude ... In the form `dataseries:element`. Can repeat. @@ -57,7 +57,7 @@ def main(): cur = database.cursor() cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") - timestamp = cur.fetchone() + timestamp = cur.fetchone()[0] params = config['graph_defaults'].copy() params['timestamp'] = timestamp @@ -94,15 +94,17 @@ def main(): if arguments['timeseries']: params['type'] = 'timeseries' - elif arguments['releasebars']: - params['type'] = 'releasebars' + elif arguments['releasebar']: + params['type'] = 'releasebar' # TODO: waffle! # read in defaults from config.toml params.update(config[params['type']]) - table = params['table'] + #pprint(params) + + table = params['table'] # maybe docopt isn't the best choice. oh well. 
for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: @@ -142,8 +144,9 @@ def main(): ).astype("Int64") # Smooth over any missing data - #dataframe.resample('W-MON') - pprint(dataframe ) + #if xaxis=='week': + # dataframe.resample('W-MON') + #pprint(dataframe) ################# # Instead of this, accumulate anything more than 10 into "other" @@ -168,7 +171,7 @@ def main(): subplots = False match params['type']: - case 'releasebars': + case 'releasebar': kind = 'bar' case 'timeseries': kind = 'area' @@ -195,7 +198,7 @@ def main(): stacked = False # true everywhere else! colormap = cmap case 'split': - """ This is releasebars-only. """ + """ This is releasebar-only. """ df = dataframe[startdate:] colormap = cmap subplots = True @@ -227,11 +230,16 @@ def main(): bbox_to_anchor=(1.0, 0.5)) madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'graph': params['graph'], - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][params['graph']]} + 'dataset': dataset, + 'graph': params['graph'], + 'timestamp': timestamp, + 'type': params['type'], + 'extra': '', + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][params['graph']], + 'type_label': params['label'], + } if 'title' in params: plt.suptitle(Template(params['title']).safe_substitute(madlibs), diff --git a/config.toml b/config.toml index 6676546..7e31ead 100644 --- a/config.toml +++ b/config.toml @@ -33,7 +33,6 @@ colors = [ ] # could be png, pdf, svg -# TODO: not yet implemented image_types = ["png"] [startdate] @@ -71,16 +70,20 @@ age="age category" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" -filebase="$dataset-timeseries-$dataseries-$graph" -# TODO: list options! +filebase="$timestamp-$dataset-$type-$graph-$dataseries" +# TODO: list possible options! 
[timeseries] -subtitle="weekly checkins" +label="weekly checkins" +subtitle="" xaxis = "week" extraselect="" -[releasebars] +[releasebar] +label="by release" table="peak" xaxis = "release" subtitle="data for each release taken from the week of that release's (current) peak" +[waffleplot] +subtitle="tk!" \ No newline at end of file diff --git a/run.sh b/run.sh index a8d85a0..d150d1e 100755 --- a/run.sh +++ b/run.sh @@ -90,10 +90,21 @@ echo -n "* Painting the feathers..." echo " vibrant!" echo "* Drawing portraits from the fossilized remains... " - LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) - ./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null - if [[ $? != 0 ]]; then - echo "! Oops." - exit 1 - fi -echo " Beautiful." +LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) + #./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null + + #if [[ $? != 0 ]]; then + # echo "! Oops." + # exit 1 + # fi + for dataset in fedora_updates_systems epel; do + for dataseries in age arch release variant; do + for graph in stacked share line; do + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries + done + for graph in stacked share split; do + ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries + done + done + done + echo " Beautiful." From c0b265a80a169c4844860f88bba54fa997e05d22 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 00:11:32 +0000 Subject: [PATCH 25/49] basically works --- diff --git a/TODO.md b/TODO.md index 533cc45..03b7537 100644 --- a/TODO.md +++ b/TODO.md @@ -3,16 +3,25 @@ 1. Save the color mappings to a file as a separate step * using defaults from config (DONE) 2. change brontosaurus-plotter to render _one_ image per call + * first pass done * with a syntax for what to include or exclude by name * and possibly with some number options? - 3. make timeline, releasebar, and waffle be separate commands - 4. 
have some script that pre-renders some defaults + * add back ephemeral, or is that just a subset of the above? + 3. make timeline, releasebar, and waffle be separate commands (DONE) + 4. have some script that pre-renders some defaults (PARTIAL) 5. and a simple front-end for exploring the rest -* put the dataset date in the filename! +* map "generic" and "unknown" and "none" to "unspecified" + -* epel -- need to special-case EL 8 by-release graphs to add peak _after_ - CentOS Linux 8 EOL +* better ordering + +* epel -- need to special-case EL 8 by-release graphs to add peak for + both before and after CentOS Linux 8 EOL + +* fix it so colors don't overlap when there's more than 12 options. + Best bet: lump everything after 11 into "other". + * change the timeseries "hide" to collect small things into "other" * text reports!!! * this week / last week / year-over-year * total systems / total persistent / total ephemeral (+%) @@ -28,13 +37,11 @@ * architecture as above * share of category (desktop/server-cloud-iot/labs) -* use jinjasql for the query templates! +* use jinjasql for the query templates? * sanitize everything coming from config.toml, really. * for the slicer, put the groups in their definitions in the config.toml -* better ordering * secondary timeline charts for variants: * variant variants! @@ -50,20 +57,7 @@ * Report estimating new installs vs upgrades (number of systems older than the release itself ... need to factor in beta release date, etc....) -* I guess we should make it so the timeseries definitions can loop over multiple datasets to avoid - a lot of redundancy. Or at least, to apply to all Fedora datasets? (Yes, that: introduce a "distro" grouping.) - - need a way to actually include multiple datasets at once though, like for the fedora linux + epel graph - -* predefined colors for some things - -* fix it so colors don't overlap when there's more than 12 options. Best bet: lump everything after 11 into "other". 
- -* stacked bar charts for each release with age, arch, variant - - * these stacked bar charts should feature each release at its peak - point, not summed (because that's its most interesting!) - * don't bother with ephemeral/persistent view (age view is enough) +* need a way to include multiple datasets at once, like for the fedora linux + epel graph * sanitize all values read from config.toml @@ -77,10 +71,7 @@ * make animations by week of full [arch,variant,release] * maybe of the breakouts too? - -* change the timeseries "hide" to collect small things into "other" - * the "age" charts are most interesting on a by-week basis, but _per release_. Can we estimate the flow from release-to-release? (Answer: @@ -124,7 +115,6 @@ * fix the code in brotosaurus washer to merge '' to 'none' rather than just renaming (works now because there are no natural 'none' entries). -* map "generic" and "unknown" and "none" to "unspecified" * instead of throwing away entries in the washing phase (especially those below thresholds), write them to a special db for "fun" analysis diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 0b8121f..7621f9d 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -3,8 +3,8 @@ Brontosaurus Plotter Usage: - brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py releasebar ( text | stacked | share | split ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py releasebar ( text | stacked | share ) [--exclude ...|--include ...] [--cutoff ] 
@@ -66,9 +66,10 @@ def main(): datasets = [t[0] for t in cur.fetchall()] cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() dataserieses = [t[0] for t in cur.description] - dataserieses.remove('week') + dataserieses.remove('dataset') dataserieses.remove('hits') + dataserieses.remove('week') #pprint(arguments) @@ -77,16 +78,16 @@ def main(): dataseries=arguments[''] if not re.match('^[0-9a-z_]*$', dataset): - print(f"Bad dataset name! '%{dataset}") + print(f"Bad dataset name! '{dataset}") exit(1) if not dataset in datasets: - print(f"Dataset '%{dataset}' not in database.") + print(f"Dataset '{dataset}' not in database.") exit(1) if not re.match('^[0-9a-z_]*$', dataseries): - print(f"Bad dataseries name! '%{dataseries}") + print(f"Bad dataseries name! '{dataseries}") exit(1) if not dataseries in dataserieses: - print(f"Dataseries '%{dataseries}' not in database.") + print(f"Dataseries '{dataseries}' not in database.") exit(1) startdate = config['startdate'][dataset] @@ -96,6 +97,9 @@ def main(): params['type'] = 'timeseries' elif arguments['releasebar']: params['type'] = 'releasebar' + if dataseries == 'release': + print("Plotting release by release makes no sense.") + exit(1) # TODO: waffle! # read in defaults from config.toml @@ -107,7 +111,7 @@ def main(): table = params['table'] # maybe docopt isn't the best choice. oh well. - for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: + for graphtype in [ 'text', 'stacked', 'share', 'line' ]: if arguments[graphtype]: params['graph']=graphtype break @@ -197,11 +201,11 @@ def main(): kind = 'line' # overrides 'area' stacked = False # true everywhere else! colormap = cmap - case 'split': - """ This is releasebar-only. """ - df = dataframe[startdate:] - colormap = cmap - subplots = True + #case 'split': + # """ This is releasebar-only. 
""" + # df = dataframe[startdate:] + # colormap = cmap + # subplots = True @@ -245,7 +249,7 @@ def main(): plt.suptitle(Template(params['title']).safe_substitute(madlibs), fontsize=24) - # FIX: make work + # FIX: make work if 'subtitle' in params: graph.set_title( Template(params['subtitle']).safe_substitute(madlibs), diff --git a/config.toml b/config.toml index 7e31ead..b2b9086 100644 --- a/config.toml +++ b/config.toml @@ -54,6 +54,7 @@ arch="CPU architecture" release="release " variant="variant" age="age category" +week="per week" [age_labels] '0'='Ephemeral' @@ -66,11 +67,13 @@ age="age category" 'line'="" 'stacked'=" (stacked)" 'share'=" (share)" +'split'="" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" -filebase="$timestamp-$dataset-$type-$graph-$dataseries" +subtitle="$extra" +filebase="$timestamp-$dataset-$type-$dataseries$extra-$graph" # TODO: list possible options! [timeseries] diff --git a/run.sh b/run.sh index d150d1e..54dc26b 100755 --- a/run.sh +++ b/run.sh @@ -87,10 +87,10 @@ echo " Built!" echo -n "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null ./brontosaurus_colorizer.py -echo " vibrant!" +echo " Vibrant!" echo "* Drawing portraits from the fossilized remains... " -LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) + #LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) #./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null #if [[ $? 
!= 0 ]]; then @@ -100,9 +100,15 @@ LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) for dataset in fedora_updates_systems epel; do for dataseries in age arch release variant; do for graph in stacked share line; do + echo ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries + echo -n " " ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries done - for graph in stacked share split; do + done + for dataseries in age arch variant; do + for graph in stacked share; do + echo ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries + echo -n " " ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries done done From cb50f9c5aa0f795e1ffd897604d6dd770e18815f Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 00:41:34 +0000 Subject: [PATCH 26/49] map the unknowns to one place --- diff --git a/TODO.md b/TODO.md index 03b7537..6eb6283 100644 --- a/TODO.md +++ b/TODO.md @@ -11,9 +11,6 @@ 4. have some script that pre-renders some defaults (PARTIAL) 5. and a simple front-end for exploring the rest -* map "generic" and "unknown" and "none" to "unspecified" - - * better ordering * epel -- need to special-case EL 8 by-release graphs to add peak for @@ -112,10 +109,6 @@ estimate that there's probably 50,000 systems out there running Fedora 20 or older). Let's not forget those! -* fix the code in brotosaurus washer to merge '' to 'none' rather than just - renaming (works now because there are no natural 'none' entries). 
- - * instead of throwing away entries in the washing phase (especially those below thresholds), write them to a special db for "fun" analysis diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index b655cb7..f4126aa 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -13,23 +13,32 @@ # The variant "CentOS Stream v21.*" is some sort of horrible scripted thing # apparently, where it keeps getting longer and longer with additional tags # -# Also, for each table, sets any variant that is '' to 'none', because -# '' is hard to work with. (I think this is when people have manually put -# "VARIANT_ID=", as opposed to not having one. I don't think that's useful -# to track separately from 'generic', really, so an alternative would be -# to merge them... but doing this for now.) -# FIXME: this needs to merge them in case someone starts actually sending -# "none" as the string — we'll get a uniqueness constraint violation. + # # This is a regex, in case that's not clear. GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" sqlite3 db/bronto.db << EOF DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; - UPDATE checkins SET variant='none' WHERE variant=''; EOF +# Change generic, unknown, and '' to all be "unspecified" +# (I think this is when people have manually put "VARIANT_ID=", as opposed +# to not having one. I don't think that's useful +# to track separately from 'generic', really, so an alternative would be +# to merge them... but doing this for now.) +# FIXME: this needs to merge them in case someone starts actually sending +# "none" as the string — we'll get a uniqueness constraint violation. 
+sqlite3 db/bronto.db << EOF + BEGIN; + INSERT INTO checkins SELECT week,dataset,release,"unspecified",arch,age,sum(hits) FROM checkins + WHERE variant='generic' or variant='none' or variant='unknown' + GROUP BY week,release,arch,age; + DELETE FROM checkins WHERE variant='generic' or variant='none' or variant='unknown'; + COMMIT; +EOF + # While some test systems ran Fedora Linux 31, the feature landed # in 32 (released 2020-04-27, so drop all the old stuff. FEDORA_STARTVER=32 diff --git a/run.sh b/run.sh index 54dc26b..d92cf15 100755 --- a/run.sh +++ b/run.sh @@ -100,15 +100,11 @@ echo "* Drawing portraits from the fossilized remains... " for dataset in fedora_updates_systems epel; do for dataseries in age arch release variant; do for graph in stacked share line; do - echo ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries - echo -n " " ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries done done for dataseries in age arch variant; do for graph in stacked share; do - echo ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries - echo -n " " ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries done done From 195d221a60583707905b5f93598d3bfe390677b6 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 13:02:26 +0000 Subject: [PATCH 27/49] chonky bars. --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 7621f9d..893027c 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -3,17 +3,17 @@ Brontosaurus Plotter Usage: - brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py releasebar ( text | stacked | share ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [options] + brontosaurus_plotter.py releasebar ( text | stacked | share ) [--exclude ...|--include ...] [options] Options: --exclude ... 
In the form `dataseries:element`. Can repeat. --include ... As above, but include _only_ these. --cutoff Drop items where the dataseries has less than n total hits + --output Optional output filename (overrides config!) """ #import matplotlib.dates as dates -import matplotlib.pyplot as plt import sqlite3 from string import Template @@ -26,6 +26,7 @@ import toml import re import matplotlib as m +import matplotlib.pyplot as plt from docopt import docopt @@ -46,6 +47,10 @@ m.rcParams['legend.frameon'] = False def main(): + # TODO: separate initialization (leave in main) from rendering one + # image (or a related set)... put that in a function. Then we can + # draw a bunch of similar files with less overhead. + arguments = docopt(__doc__, version='0.1') @@ -172,7 +177,6 @@ def main(): stacked = True - subplots = False match params['type']: case 'releasebar': @@ -209,18 +213,30 @@ def main(): - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, - kind=kind, - stacked=stacked, - subplots=subplots) + # Start the actual graph. + # same for both kinds, except width + match kind: + case 'bar': + graph = df.plot(figsize=config['figsize'], + colormap=colormap, + kind=kind, + stacked=stacked, + width=0.95 + ) + case _: + graph = df.plot(figsize=config['figsize'], + colormap=colormap, + kind=kind, + stacked=stacked, + ) # Labels and titles and stuff. 
ax = plt.gca() handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this if dataseries == 'age': labels = list(map(config['age_labels'].get, labels)) @@ -260,16 +276,24 @@ def main(): graph.set_ylim([0, None]) graph.spines['right'].set_visible(False) graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) + graph.yaxis.set_major_formatter(m.ticker.EngFormatter(sep='')) # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) graph.set_xlabel('') + # aesthetic pickiness! + if kind == 'bar': + ax.tick_params(bottom=False) + + + # Not sure why these get rotated by default. Unrotate them! + plt.xticks(rotation = 0) + for ext in config['image_types']: graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") + graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") diff --git a/brontosaurusifier_utils/colormapping.py b/brontosaurusifier_utils/colormapping.py index c338a07..7611c71 100644 --- a/brontosaurusifier_utils/colormapping.py +++ b/brontosaurusifier_utils/colormapping.py @@ -31,7 +31,7 @@ def load_color_cache(cachefile,presetfile): try: cached = toml.load(cachefile) except FileNotFoundError: - print(f"Can't open color cache {cachefile}, so starting fresh.") + #print(f"Can't open color cache {cachefile}, so starting fresh.") cached = {} try: diff --git a/color-presets.toml b/color-presets.toml index 8c0daf7..76f12bc 100644 --- a/color-presets.toml +++ b/color-presets.toml @@ -6,3 +6,15 @@ "AlmaLinux" = "#ffcc0a" "Oracle Linux Server" = "#101010" "CloudLinux" = "#0097f3" + +["fedora_updates_systems.variant"] +"unspecfied" = "#808080" + +["fedora_updates_containers.variant"] +"unspecfied" = "#808080" + 
+["fedora_rawhide_systems.variant"] +"unspecfied" = "#808080" + +["fedora_rawhide_containers.variant"] +"unspecfied" = "#808080" diff --git a/run.sh b/run.sh index d92cf15..d2cb1b6 100755 --- a/run.sh +++ b/run.sh @@ -36,7 +36,7 @@ datafreshness echo -n "* Stomping intermediate files... " rm db/bronto.db 2> /dev/null -echo " extinct." +echo " extinct." echo -n "* Fossilizing ancient images... " mkdir -p images/{svg,png} @@ -52,7 +52,7 @@ echo -n "* Slicing brontosauruses... " echo "! Oops." exit 1 fi -echo " into bits." +echo " into bits." echo -n "* Scrubbing off the dirt... " ./brontosaurus_washer.sh @@ -60,7 +60,7 @@ echo -n "* Scrubbing off the dirt... " echo "! Oops." exit 1 fi -echo " shiny!" +echo " shiny!" echo -n "* Finding the strongest... " ./brontosaurus_fight.sh @@ -68,7 +68,7 @@ echo -n "* Finding the strongest... " echo "! Oops." exit 1 fi -echo " rarrhhhhr!" +echo " rarrhhhhr!" echo -n "* Sorting the eggs... " ./brontosaurus_egg-sorter.py @@ -76,18 +76,18 @@ echo -n "* Sorting the eggs... " echo "! Oops." exit 1 fi -echo " binaried." +echo " binaried." -echo "* Creating cages for different exhibits..." +echo -n "* Creating exhibit cages..." for dataset in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do mkdir -p images/{svg,png}/$dataset done -echo " Built!" +echo " built!" echo -n "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null ./brontosaurus_colorizer.py -echo " Vibrant!" +echo " vibrant!" echo "* Drawing portraits from the fossilized remains... 
" #LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) From 265ff1f59cfc4a7afcd13aa87f72fa11081b0eed Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 13:24:21 +0000 Subject: [PATCH 28/49] oops actually get '' --- diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index f4126aa..36bc013 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -33,9 +33,9 @@ EOF sqlite3 db/bronto.db << EOF BEGIN; INSERT INTO checkins SELECT week,dataset,release,"unspecified",arch,age,sum(hits) FROM checkins - WHERE variant='generic' or variant='none' or variant='unknown' + WHERE variant='generic' or variant='' or variant='none' or variant='unknown' GROUP BY week,release,arch,age; - DELETE FROM checkins WHERE variant='generic' or variant='none' or variant='unknown'; + DELETE FROM checkins WHERE variant='generic' or variant='' or variant='none' or variant='unknown'; COMMIT; EOF From 54a3210dd4829227408f1e163dd283685cb9f93a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 16:13:12 +0000 Subject: [PATCH 29/49] "other" works --- diff --git a/TODO.md b/TODO.md index 6eb6283..c54c9ef 100644 --- a/TODO.md +++ b/TODO.md @@ -16,9 +16,6 @@ * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL -* fix it so colors don't overlap when there's more than 12 options. - Best bet: lump everything after 11 into "other". - * change the timeseries "hide" to collect small things into "other" * text reports!!! 
* this week / last week / year-over-year diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 893027c..6de8639 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -32,8 +32,6 @@ from docopt import docopt from brontosaurusifier_utils import colormapping -DATAFILE = 'db/bronto.db' - m.use("Agg") m.style.use('seaborn-colorblind') @@ -58,7 +56,7 @@ def main(): colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) - database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + database = sqlite3.connect(config['datafile'], detect_types=sqlite3.PARSE_DECLTYPES) cur = database.cursor() cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") @@ -147,30 +145,44 @@ def main(): df = pd.read_sql_query(query, parse_dates='week', con=database) - dataframe=df.pivot(index=xaxis, + datatable=df.pivot(index=xaxis, columns=dataseries, values='hits' ).astype("Int64") # Smooth over any missing data #if xaxis=='week': - # dataframe.resample('W-MON') - #pprint(dataframe) + # datatable.resample('W-MON') + #pprint(datatable) ################# - # Instead of this, accumulate anything more than 10 into "other" - # ... and do it _elsewhere_ (easier to do before pivot anyway!) - # + limit number of columns to 10 + other - - hidelist = dataframe.div(dataframe.sum(axis=1), axis=0).max() < 0.2/100 - dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) + # Find the items below thresholds for percent in any given + # dataseries entry, and also for excess number of items. + # Bin 'em into "other" + # TODO: weight this towards the end of the data, so we don't drop + # emerging interesting things in favor of old news? 
+ toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 + others = toosmall[toosmall == True].keys() + othercol = datatable[others].sum(axis=1).astype("Int64") + datatable.drop(columns=others, inplace=True) + + # still too big? + if len(datatable.columns) > config['maxitems']: + # the -1 in `config['maxitems']-1` is so we don't exceed the + # limit by adding the "others" column! + others = datatable.sum().sort_values(ascending=False)[config['maxitems']-1:].index + othercol += datatable[others].sum(axis=1).astype("Int64") + datatable.drop(columns=others, inplace=True) + + if othercol.any(): + datatable['other'] = othercol ################## # our colors. # the complication here is keeping the same color for the same label # across multiple graphs! cmap = m.colors.ListedColormap(colormapping.get_colors(colormappings, config['colors'], - dataset, dataseries, list(dataframe.columns))) + dataset, dataseries, list(datatable.columns))) ################## # and now.... graph it! @@ -186,28 +198,28 @@ def main(): match params['graph']: case 'stacked': - df = dataframe[startdate:][dataframe.columns[::-1]] + df = datatable[startdate:][datatable.columns[::-1]] colormap = m.colors.ListedColormap(cmap.colors[::-1]) case 'share': if dataseries == 'age': # lower numbers are newer! - df = dataframe[startdate:][dataframe.columns[::-1]].div( - dataframe.sum(axis=1), axis=0)*100 + df = datatable[startdate:][datatable.columns[::-1]].div( + datatable.sum(axis=1), axis=0)*100 colormap = m.colors.ListedColormap(cmap.colors[::-1]) else: # todo: sort arch and variant by popularity, not name! - df = dataframe[startdate:].div( - dataframe.sum(axis=1), axis=0)*100 + df = datatable[startdate:].div( + datatable.sum(axis=1), axis=0)*100 colormap = cmap case 'line': """ This is timeseries-only. """ - df = dataframe[startdate:] + df = datatable[startdate:] kind = 'line' # overrides 'area' stacked = False # true everywhere else! 
colormap = cmap #case 'split': # """ This is releasebar-only. """ - # df = dataframe[startdate:] + # df = datatable[startdate:] # colormap = cmap # subplots = True @@ -292,7 +304,7 @@ def main(): graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") - graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") diff --git a/brontosaurusifier_utils/colormapping.py b/brontosaurusifier_utils/colormapping.py index 7611c71..04502ae 100644 --- a/brontosaurusifier_utils/colormapping.py +++ b/brontosaurusifier_utils/colormapping.py @@ -19,7 +19,10 @@ def get_colors(colormappings, colorlist, dataset, dataseries, items): outcolors = [] for item in items: if str(item) not in colormappings[key]: - colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] + if item == 'other': + colormappings[key][str(item)] = colorlist[-1] + else: + colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] outcolors.append(colormappings[key][str(item)]) return outcolors diff --git a/color-presets.toml b/color-presets.toml index 76f12bc..6aefa12 100644 --- a/color-presets.toml +++ b/color-presets.toml @@ -1,5 +1,5 @@ ["epel.variant"] -"CentOS Linux" = "#808080" +"CentOS Linux" = "#a0a0a0" "Red Hat Enterprise Linux" = "#ee0000" "CentOS Stream" = "#a14a8c" "Rocky Linux" = "#10b981" @@ -8,13 +8,13 @@ "CloudLinux" = "#0097f3" ["fedora_updates_systems.variant"] -"unspecfied" = "#808080" +"unspecified" = "#cccccc" ["fedora_updates_containers.variant"] -"unspecfied" = "#808080" +"unspecified" = "#cccccc" ["fedora_rawhide_systems.variant"] -"unspecfied" = "#808080" +"unspecified" = "#cccccc" ["fedora_rawhide_containers.variant"] -"unspecfied" = "#808080" 
+"unspecified" = "#cccccc" diff --git a/config.toml b/config.toml index b2b9086..cf12d38 100644 --- a/config.toml +++ b/config.toml @@ -8,12 +8,24 @@ imagepath="images/$filetype/$dataset/" figsize = [16, 9] dpi = 300 +# Entries where the highest value for any +# week (or release) is not above this +# percent of the total for that week (or +# release) will be binned together into +# "other" +minpercent = 0.5 -# Our palette. Note that this also limits the -# number of items per chart. If there are -# more than the number of colors, the last -# color here becomes "other". -# (TODO! Implement that!) + +# Also bin excess entries with "other". +# Note that this limit *does* include the +# "other" line, if any. +maxitems = 10 + +# Our palette. Note that if `maxitems` +# is greater than the number of options in +# this list, it will cycle around! +# Also note: "other" is automatically the +# _last_ color in the list. colors = [ '#51a2da', '#294172', @@ -29,7 +41,6 @@ colors = [ '#aad0ee', '#101010', '#535961', - '#808080', ] # could be png, pdf, svg diff --git a/run.sh b/run.sh index d2cb1b6..e1cb461 100755 --- a/run.sh +++ b/run.sh @@ -36,7 +36,7 @@ datafreshness echo -n "* Stomping intermediate files... " rm db/bronto.db 2> /dev/null -echo " extinct." +echo "extinct." echo -n "* Fossilizing ancient images... " mkdir -p images/{svg,png} @@ -44,7 +44,7 @@ echo -n "* Fossilizing ancient images... " rm images/png/* 2> /dev/null rm images/svg/*/* 2> /dev/null rm images/png/*/* 2> /dev/null -echo " buried." +echo " buried." echo -n "* Slicing brontosauruses... " ./brontosaurus_slicer.sh @@ -52,7 +52,7 @@ echo -n "* Slicing brontosauruses... " echo "! Oops." exit 1 fi -echo " into bits." +echo " into bits." echo -n "* Scrubbing off the dirt... " ./brontosaurus_washer.sh @@ -60,7 +60,7 @@ echo -n "* Scrubbing off the dirt... " echo "! Oops." exit 1 fi -echo " shiny!" +echo " shiny!" echo -n "* Finding the strongest... 
" ./brontosaurus_fight.sh @@ -68,7 +68,7 @@ echo -n "* Finding the strongest... " echo "! Oops." exit 1 fi -echo " rarrhhhhr!" +echo " rarrhhhhr!" echo -n "* Sorting the eggs... " ./brontosaurus_egg-sorter.py @@ -82,7 +82,7 @@ echo -n "* Creating exhibit cages..." for dataset in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do mkdir -p images/{svg,png}/$dataset done -echo " built!" +echo " built!" echo -n "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null From 1cc61ede81f9d3c569e47f787dc266d48de4cdbc Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 16:23:57 +0000 Subject: [PATCH 30/49] wash better --- diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index 36bc013..217dd88 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -1,9 +1,32 @@ #!/bin/bash # -# For every table in bronto.db, delete "known bad" variants. -# +function counthits() { + true + #echo 'SELECT sum(hits) FROM checkins' | sqlite3 db/bronto.db; +} + +counthits + +# While some test systems ran Fedora Linux 31, the feature landed +# in 32 (released 2020-04-27, so drop all the old stuff. +FEDORA_STARTVER=32 +FEDORA_STARTDAY='2021-01-01' +# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) +EPEL_STARTVER=8 +EPEL_STARTDAY='2021-01-01' +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; +EOF +counthits + + +# For every table in bronto.db, delete "known bad" variants. +# # Please document each new reason for cleaning something here, so we know # why later. 
# @@ -13,7 +36,6 @@ # The variant "CentOS Stream v21.*" is some sort of horrible scripted thing # apparently, where it keeps getting longer and longer with additional tags # - # # This is a regex, in case that's not clear. GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" @@ -21,6 +43,7 @@ GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89 sqlite3 db/bronto.db << EOF DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; EOF +counthits # Change generic, unknown, and '' to all be "unspecified" @@ -38,20 +61,7 @@ sqlite3 db/bronto.db << EOF DELETE FROM checkins WHERE variant='generic' or variant='' or variant='none' or variant='unknown'; COMMIT; EOF - -# While some test systems ran Fedora Linux 31, the feature landed -# in 32 (released 2020-04-27, so drop all the old stuff. -FEDORA_STARTVER=32 -FEDORA_STARTDAY='2021-01-01' -# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) -EPEL_STARTVER=8 -EPEL_STARTDAY='2021-01-01' -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; -EOF +counthits # Clean up entries for name, arch, or release that show up # with less than some threshold in _total_ hits in the @@ -75,4 +85,5 @@ for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bro DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); EOF 
+counthits done From 4db0ac5c67fd8df2462495d9461cadeef4ce005c Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 16:46:00 +0000 Subject: [PATCH 31/49] don't wash away datasets! --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 6de8639..4843a7a 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -93,8 +93,6 @@ def main(): print(f"Dataseries '{dataseries}' not in database.") exit(1) - startdate = config['startdate'][dataset] - if arguments['timeseries']: params['type'] = 'timeseries' @@ -198,28 +196,27 @@ def main(): match params['graph']: case 'stacked': - df = datatable[startdate:][datatable.columns[::-1]] + df = datatable[datatable.columns[::-1]] colormap = m.colors.ListedColormap(cmap.colors[::-1]) case 'share': if dataseries == 'age': # lower numbers are newer! - df = datatable[startdate:][datatable.columns[::-1]].div( + df = datatable[datatable.columns[::-1]].div( datatable.sum(axis=1), axis=0)*100 colormap = m.colors.ListedColormap(cmap.colors[::-1]) else: # todo: sort arch and variant by popularity, not name! - df = datatable[startdate:].div( - datatable.sum(axis=1), axis=0)*100 + df = datatable.div(datatable.sum(axis=1), axis=0)*100 colormap = cmap case 'line': """ This is timeseries-only. """ - df = datatable[startdate:] + df = datatable kind = 'line' # overrides 'area' stacked = False # true everywhere else! colormap = cmap #case 'split': # """ This is releasebar-only. 
""" - # df = datatable[startdate:] + # df = datatable # colormap = cmap # subplots = True diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index 217dd88..1e3b7da 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -57,7 +57,7 @@ sqlite3 db/bronto.db << EOF BEGIN; INSERT INTO checkins SELECT week,dataset,release,"unspecified",arch,age,sum(hits) FROM checkins WHERE variant='generic' or variant='' or variant='none' or variant='unknown' - GROUP BY week,release,arch,age; + GROUP BY week, dataset, release, arch, age; DELETE FROM checkins WHERE variant='generic' or variant='' or variant='none' or variant='unknown'; COMMIT; EOF diff --git a/config.toml b/config.toml index cf12d38..5f5e70c 100644 --- a/config.toml +++ b/config.toml @@ -46,13 +46,6 @@ colors = [ # could be png, pdf, svg image_types = ["png"] -[startdate] -fedora_updates_systems = '2021-01-01' # F32 release not fully captured, so start here. -fedora_updates_containers = '2021-01-01' -fedora_rawhide_systems = '2021-01-01' -fedora_rawhide_containers = '2021-01-01' -epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 - [dataset_labels] epel = "Extra Packages for Enterprise Linux" fedora_updates_systems = "Fedora Linux systems" From 883de6061fc53adf4580a61a8cce113801153b66 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 20:22:38 +0000 Subject: [PATCH 32/49] better othering --- diff --git a/brontosaurus_fight.sh b/brontosaurus_fight.sh index 5726335..a322fd4 100755 --- a/brontosaurus_fight.sh +++ b/brontosaurus_fight.sh @@ -3,6 +3,10 @@ # Create a view which only shows the weeks where each release # is at its peak. If someone actually is Good At SQL, I would # not mind help making this more clear. +# +# Also, note the special casing for Fedora Linux 32 (or earlier). +# F32 is useful in the timeline view, but is past its peak +# at the data start date, so we don't want it here. 
sqlite3 db/bronto.db << EOF DROP VIEW IF EXISTS peak; @@ -19,6 +23,7 @@ sqlite3 db/bronto.db << EOF (SELECT week,dataset,release,max(hits) FROM (SELECT week,dataset,release,sum(hits) AS hits FROM checkins + WHERE ( release>'32' OR dataset not like 'fedora%' ) GROUP BY week,dataset,release ORDER BY week) GROUP BY dataset,release) AS peaks diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 4843a7a..b380940 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -9,7 +9,6 @@ Usage: Options: --exclude ... In the form `dataseries:element`. Can repeat. --include ... As above, but include _only_ these. - --cutoff Drop items where the dataseries has less than n total hits --output Optional output filename (overrides config!) """ @@ -106,8 +105,11 @@ def main(): # read in defaults from config.toml params.update(config[params['type']]) - #pprint(params) - + match params['type']: + case 'releasebar': + kind = 'bar' + case 'timeseries': + kind = 'area' table = params['table'] @@ -134,6 +136,7 @@ def main(): xaxis = params['xaxis'] + query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits FROM {table} WHERE dataset =\"{dataset}\" @@ -153,28 +156,37 @@ def main(): # datatable.resample('W-MON') #pprint(datatable) - ################# # Find the items below thresholds for percent in any given - # dataseries entry, and also for excess number of items. - # Bin 'em into "other" + # dataseries entry, and also for excess number of items, + # and bin them together into "other" + # # TODO: weight this towards the end of the data, so we don't drop # emerging interesting things in favor of old news? toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 others = toosmall[toosmall == True].keys() - othercol = datatable[others].sum(axis=1).astype("Int64") + othercol = datatable[others].sum(axis=1).astype("Int64") datatable.drop(columns=others, inplace=True) # still too big? 
if len(datatable.columns) > config['maxitems']: # the -1 in `config['maxitems']-1` is so we don't exceed the # limit by adding the "others" column! - others = datatable.sum().sort_values(ascending=False)[config['maxitems']-1:].index + others = datatable.sum(axis=0).sort_values(ascending=False)[config['maxitems']-1:].index othercol += datatable[others].sum(axis=1).astype("Int64") datatable.drop(columns=others, inplace=True) - if othercol.any(): + # if the remaining "other" ends up big enough to matter, add it to the table + # the division is: highest row (release, say) for the item, compared to the total for that row + if othercol.any() and othercol.max() / datatable.sum(axis=1).max() >= config['minpercent'] / 100: datatable['other'] = othercol + # For bar charts, drop any rows (bars) which are below the threshold + if kind == 'bar': + toosmall=datatable.sum(axis=1)/datatable.sum(axis=1).max() < config['minpercent'] / 100 + datatable.drop(toosmall[toosmall == True].keys(), inplace=True) + + + ################## # our colors. # the complication here is keeping the same color for the same label @@ -188,35 +200,30 @@ def main(): stacked = True - match params['type']: - case 'releasebar': - kind = 'bar' - case 'timeseries': - kind = 'area' + match params['graph']: case 'stacked': - df = datatable[datatable.columns[::-1]] + datatable = datatable[datatable.columns[::-1]] colormap = m.colors.ListedColormap(cmap.colors[::-1]) case 'share': if dataseries == 'age': # lower numbers are newer! - df = datatable[datatable.columns[::-1]].div( + datatable = datatable[datatable.columns[::-1]].div( datatable.sum(axis=1), axis=0)*100 colormap = m.colors.ListedColormap(cmap.colors[::-1]) else: # todo: sort arch and variant by popularity, not name! - df = datatable.div(datatable.sum(axis=1), axis=0)*100 + datatable = datatable.div(datatable.sum(axis=1), axis=0)*100 colormap = cmap case 'line': """ This is timeseries-only. 
""" - df = datatable + #datatable = datatable kind = 'line' # overrides 'area' stacked = False # true everywhere else! colormap = cmap #case 'split': # """ This is releasebar-only. """ - # df = datatable # colormap = cmap # subplots = True @@ -226,14 +233,14 @@ def main(): # same for both kinds, except width match kind: case 'bar': - graph = df.plot(figsize=config['figsize'], + graph = datatable.plot(figsize=config['figsize'], colormap=colormap, kind=kind, stacked=stacked, width=0.95 ) case _: - graph = df.plot(figsize=config['figsize'], + graph = datatable.plot(figsize=config['figsize'], colormap=colormap, kind=kind, stacked=stacked, diff --git a/color-presets.toml b/color-presets.toml index 6aefa12..7962223 100644 --- a/color-presets.toml +++ b/color-presets.toml @@ -8,13 +8,13 @@ "CloudLinux" = "#0097f3" ["fedora_updates_systems.variant"] -"unspecified" = "#cccccc" +"unspecified" = "#a0a0a0" ["fedora_updates_containers.variant"] -"unspecified" = "#cccccc" +"unspecified" = "#a0a0a0" ["fedora_rawhide_systems.variant"] -"unspecified" = "#cccccc" +"unspecified" = "#a0a0a0" ["fedora_rawhide_containers.variant"] -"unspecified" = "#cccccc" +"unspecified" = "#a0a0a0" diff --git a/config.toml b/config.toml index 5f5e70c..601f2e0 100644 --- a/config.toml +++ b/config.toml @@ -46,6 +46,7 @@ colors = [ # could be png, pdf, svg image_types = ["png"] + [dataset_labels] epel = "Extra Packages for Enterprise Linux" fedora_updates_systems = "Fedora Linux systems" From a6af5fa228398efcda70a63d6a031af169c77df8 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 20:34:42 +0000 Subject: [PATCH 33/49] do the other datasets too --- diff --git a/run.sh b/run.sh index e1cb461..0475084 100755 --- a/run.sh +++ b/run.sh @@ -97,7 +97,7 @@ echo "* Drawing portraits from the fossilized remains... " # echo "! Oops." 
# exit 1 # fi - for dataset in fedora_updates_systems epel; do + for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates_containers fedora_rawhide_containers; do for dataseries in age arch release variant; do for graph in stacked share line; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries From 1ae5d956fd1b2cc5dc93c3785d473b44b77b917a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 20:49:24 +0000 Subject: [PATCH 34/49] um, yeah. simplify the colormap logic --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index b380940..87365eb 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -186,48 +186,46 @@ def main(): datatable.drop(toosmall[toosmall == True].keys(), inplace=True) + # If the items in this dataset aren't numeric, + # sort columns by weight + #if not datatable.columns.str.isnumeric().all(): - ################## - # our colors. - # the complication here is keeping the same color for the same label - # across multiple graphs! - cmap = m.colors.ListedColormap(colormapping.get_colors(colormappings, config['colors'], - dataset, dataseries, list(datatable.columns))) - ################## - # and now.... graph it! - stacked = True - + + + + + stacked = True match params['graph']: case 'stacked': datatable = datatable[datatable.columns[::-1]] - colormap = m.colors.ListedColormap(cmap.colors[::-1]) case 'share': if dataseries == 'age': # lower numbers are newer! datatable = datatable[datatable.columns[::-1]].div( datatable.sum(axis=1), axis=0)*100 - colormap = m.colors.ListedColormap(cmap.colors[::-1]) else: # todo: sort arch and variant by popularity, not name! datatable = datatable.div(datatable.sum(axis=1), axis=0)*100 - colormap = cmap case 'line': """ This is timeseries-only. """ #datatable = datatable kind = 'line' # overrides 'area' stacked = False # true everywhere else! - colormap = cmap #case 'split': # """ This is releasebar-only. 
""" # colormap = cmap # subplots = True - - + + # our colors. + # the complication here is keeping the same color for the same label + # across multiple graphs! + colormap = m.colors.ListedColormap(colormapping.get_colors(colormappings, config['colors'], + dataset, dataseries, list(datatable.columns))) # Start the actual graph. # same for both kinds, except width @@ -258,7 +256,7 @@ def main(): labels = list(map(config['age_labels'].get, labels)) # hmmm - if params['graph'] == 'stacked' or params['graph'] == 'shared': + if params['graph'] == 'stacked' or params['graph'] == 'share': handles[:] = handles[::-1] labels[:] = labels[::-1] From e9196b6faf964e3bc3cc08ffb4cc48dc0a4a4f62 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 21:35:30 +0000 Subject: [PATCH 35/49] better ordering -- good enough for now! --- diff --git a/TODO.md b/TODO.md index c54c9ef..8d01bbc 100644 --- a/TODO.md +++ b/TODO.md @@ -11,7 +11,6 @@ 4. have some script that pre-renders some defaults (PARTIAL) 5. and a simple front-end for exploring the rest -* better ordering * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 87365eb..f57f811 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -188,37 +188,33 @@ def main(): # If the items in this dataset aren't numeric, # sort columns by weight - #if not datatable.columns.str.isnumeric().all(): - - - - - - - - + if datatable.columns.dtype != 'int64': + if not datatable.columns.str.isnumeric().all(): + datatable = datatable.reindex(datatable.sum(axis=0).sort_values(ascending=True).index,axis=1) + if dataseries == 'release': + # treat more recent releases as "lower" conceptually + datatable = datatable[datatable.columns[::-1]] stacked = True + yformatter = m.ticker.EngFormatter(sep='') match params['graph']: case 'stacked': + # invert! 
datatable = datatable[datatable.columns[::-1]] case 'share': - if dataseries == 'age': - # lower numbers are newer! - datatable = datatable[datatable.columns[::-1]].div( - datatable.sum(axis=1), axis=0)*100 - else: - # todo: sort arch and variant by popularity, not name! - datatable = datatable.div(datatable.sum(axis=1), axis=0)*100 + # also invert + datatable = datatable[datatable.columns[::-1]] + # and convert to percent + datatable = datatable.div(datatable.sum(axis=1), axis=0)*100 + yformatter = m.ticker.PercentFormatter() case 'line': """ This is timeseries-only. """ - #datatable = datatable kind = 'line' # overrides 'area' stacked = False # true everywhere else! #case 'split': # """ This is releasebar-only. """ - # colormap = cmap + # subplots = True # our colors. @@ -248,17 +244,19 @@ def main(): ax = plt.gca() handles, labels = ax.get_legend_handles_labels() - + # default direction seems backwards to me! + handles[:] = handles[::-1] + labels[:] = labels[::-1] # TODO: generalize this if dataseries == 'age': labels = list(map(config['age_labels'].get, labels)) - - # hmmm - if params['graph'] == 'stacked' or params['graph'] == 'share': - handles[:] = handles[::-1] - labels[:] = labels[::-1] + if kind == 'line': + # put it back the other way for this case! + handles[:] = handles[::-1] + labels[:] = labels[::-1] + plt.legend(handles, labels, loc='center left', bbox_to_anchor=(1.0, 0.5)) @@ -290,7 +288,7 @@ def main(): graph.set_ylim([0, None]) graph.spines['right'].set_visible(False) graph.spines['top'].set_visible(False) - graph.yaxis.set_major_formatter(m.ticker.EngFormatter(sep='')) + graph.yaxis.set_major_formatter(yformatter) # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) graph.set_xlabel('') From 97eb8055ed407873000461417691bcba1f6bbf85 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 00:23:01 +0000 Subject: [PATCH 36/49] ehhh this is not good. but getting there! 
--- diff --git a/TODO.md b/TODO.md index 8d01bbc..e39af67 100644 --- a/TODO.md +++ b/TODO.md @@ -15,6 +15,7 @@ * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL +* add "remove bad characters!" from cleanup script * text reports!!! * this week / last week / year-over-year diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index f57f811..899d23f 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -3,16 +3,13 @@ Brontosaurus Plotter Usage: - brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [options] - brontosaurus_plotter.py releasebar ( text | stacked | share ) [--exclude ...|--include ...] [options] + brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [ ... ] [options] + brontosaurus_plotter.py releasebar ( text | stacked | share ) [ ... ] [options] Options: - --exclude ... In the form `dataseries:element`. Can repeat. - --include ... As above, but include _only_ these. - --output Optional output filename (overrides config!) + --output Optional output filename (overrides config!) 
""" -#import matplotlib.dates as dates import sqlite3 from string import Template @@ -29,6 +26,8 @@ import matplotlib.pyplot as plt from docopt import docopt +from collections import defaultdict + from brontosaurusifier_utils import colormapping m.use("Agg") @@ -73,8 +72,6 @@ def main(): dataserieses.remove('hits') dataserieses.remove('week') - #pprint(arguments) - dataset=arguments[''] dataseries=arguments[''] @@ -90,9 +87,47 @@ def main(): exit(1) if not dataseries in dataserieses: print(f"Dataseries '{dataseries}' not in database.") - exit(1) + exit(1) + filterincludes = defaultdict(set) + filter = "" + + if arguments['']: + for f in arguments['']: + if "'" in f: + # TODO: actual, good validation + print("No please.") + exit(2) + try: + (filterseries,filteritem) = f.split('+',1) + filterincludes[filterseries].add(filteritem) + except ValueError: + try: + (filterseries,filteritem) = f.split('-',1) + filter+=f" AND {filterseries} != '{filteritem}'" + except ValueError: + print(f"Filter `{f}` is not valid. Must be `dataseries+item` or `dataseries-item`") + exit(1) + if not filterseries in dataserieses: + print(f"Filter `{f}` doesn't match a dataseries. 
(Try `variant`, `release`, `arch`, or `age`.)") + exit(1) + + # TODO: stop this nonsense, use a proper ORM + + for (incseries,incitems) in filterincludes.items(): + # TODO parens only if needed + filter += " AND (" + inclist=set() + for incitem in incitems: + inclist.add(f"{incseries} = {incitem}") + filter += " OR ".join(inclist) + filter += " ) " + if filter: + filterstring = ":" + ":".join(sorted(arguments[''])).replace(" ","_") + else: + filterstring = "" + if arguments['timeseries']: params['type'] = 'timeseries' elif arguments['releasebar']: @@ -120,14 +155,16 @@ def main(): break - if params['graph'] == 'text': + if params['graph'] == 'text': query = f"""SELECT {dataseries},sum(hits) AS total FROM {table} WHERE dataset = '{dataset}' + {filter} GROUP BY {dataseries} ORDER BY total ASC """ + cur.execute(query) # TODO: add title here for (item,hits) in cur: @@ -140,9 +177,11 @@ def main(): query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits FROM {table} WHERE dataset =\"{dataset}\" + {filter} GROUP BY {xaxis}, {dataseries} ORDER BY {xaxis} """ + print(query) df = pd.read_sql_query(query, parse_dates='week', con=database) @@ -271,6 +310,7 @@ def main(): 'dataset_label': config['dataset_labels'][dataset], 'view_label': config['view_labels'][params['graph']], 'type_label': params['label'], + 'filter' : filterstring, } if 'title' in params: @@ -304,7 +344,7 @@ def main(): graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") - #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") diff --git a/config.toml b/config.toml index 601f2e0..95a10e5 100644 --- a/config.toml +++ b/config.toml @@ -6,7 +6,7 @@ color_cache = "db/color-cache.toml" 
imagepath="images/$filetype/$dataset/" figsize = [16, 9] -dpi = 300 +dpi = 150 # Entries where the highest value for any # week (or release) is not above this @@ -77,13 +77,12 @@ week="per week" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" -subtitle="$extra" -filebase="$timestamp-$dataset-$type-$dataseries$extra-$graph" +subtitle="$filter" +filebase="$timestamp-$dataset-$type-$graph-$dataseries$filter" # TODO: list possible options! [timeseries] label="weekly checkins" -subtitle="" xaxis = "week" extraselect="" @@ -91,7 +90,7 @@ extraselect="" label="by release" table="peak" xaxis = "release" -subtitle="data for each release taken from the week of that release's (current) peak" +subtitle="$filter (at release peak)" [waffleplot] subtitle="tk!" \ No newline at end of file From eeff2528d7c680b2c4d0a3174bb9683d08db8f19 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 14:55:27 +0000 Subject: [PATCH 37/49] there that's better -- minpercent now works --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 899d23f..9ea613e 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -181,7 +181,6 @@ def main(): GROUP BY {xaxis}, {dataseries} ORDER BY {xaxis} """ - print(query) df = pd.read_sql_query(query, parse_dates='week', con=database) @@ -199,9 +198,24 @@ def main(): # dataseries entry, and also for excess number of items, # and bin them together into "other" # + # We consider three things: + # + # * percent of at least one row (week or release + # depending on chart type) must exceed minpercent + # + # * percent of total must also. + # + # * but wait, if the percent of any of the last four + # rows is above the threshold, keep that after all + # # TODO: weight this towards the end of the data, so we don't drop # emerging interesting things in favor of old news? 
- toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 + # + # old way: toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 + # + toosmall = datatable[datatable==datatable.max()].div(datatable.sum(axis=1),axis=0).max() < config['minpercent'] / 100 + toosmall |= datatable.sum()/datatable.sum().sum() < config['minpercent'] / 100 + toosmall &= datatable[-4:].div(datatable.sum(axis=1),axis=0).fillna(0).max() < config['minpercent'] / 100 others = toosmall[toosmall == True].keys() othercol = datatable[others].sum(axis=1).astype("Int64") datatable.drop(columns=others, inplace=True) diff --git a/config.toml b/config.toml index 95a10e5..6db9d5f 100644 --- a/config.toml +++ b/config.toml @@ -21,6 +21,7 @@ minpercent = 0.5 # "other" line, if any. maxitems = 10 + # Our palette. Note that if `maxitems` # is greater than the number of options in # this list, it will cycle around! From d7e22c27cf57e7f0990077bf59d858cfcdd89a9a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 15:26:57 +0000 Subject: [PATCH 38/49] optimization notes --- diff --git a/TODO.md b/TODO.md index e39af67..fae582d 100644 --- a/TODO.md +++ b/TODO.md @@ -31,6 +31,21 @@ * architecture as above * share of category (desktop/server-cloud-iot/labs) +* profiling notes for the plotter: + * A full 75% that can't really be optimized + * 48% pandas graph call + * 27% savefig call + * But this could probably be better: + * 17.2% the main pd.read_sql_query + * 0.6% datatable pivot + * And these things are redundant: + * 0.3% docopt + * 0.9% loading config toml + * 2.2% loading the color cache + * 1.2% reading the names of the datasets :( + * 1.7% the "other" filtering (not huge but probably an easy fix) + * (That all accounts for 99.1% of time, so... not much else to improve!) + * use jinjasql for the query templates? * sanitize everything coming from config.toml, really. 
diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 9ea613e..71abcb1 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -38,9 +38,6 @@ m.rcParams['font.family'] = 'Montserrat' m.rcParams['legend.frameon'] = False - - - def main(): # TODO: separate initialization (leave in main) from rendering one @@ -358,7 +355,7 @@ def main(): graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") - graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") From adb4a888a98a6927a30e43eed8e050207d8f3d10 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 15:48:49 +0000 Subject: [PATCH 39/49] allow "age+ephemeral" and "age+persistent" --- diff --git a/TODO.md b/TODO.md index fae582d..a05f228 100644 --- a/TODO.md +++ b/TODO.md @@ -3,9 +3,8 @@ 1. Save the color mappings to a file as a separate step * using defaults from config (DONE) 2. change brontosaurus-plotter to render _one_ image per call - * first pass done - * with a syntax for what to include or exclude by name - * and possibly with some number options? + * first pass (DONE) + * with a syntax for what to include or exclude by name (DONE) * add back ephemeral, or is that just a subset of the above? 3. make timeline, releasebar, and waffle be separate commands (DONE) 4. have some script that pre-renders some defaults (PARTIAL) @@ -15,7 +14,11 @@ * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL -* add "remove bad characters!" from cleanup script +* add "remove bad characters!" to cleanup script + +* add numeric options (less than, greater than) to plotter filter + +* validate that filter items exist in the data? * text reports!!! 
* this week / last week / year-over-year diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 71abcb1..d4731c9 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -86,14 +86,31 @@ def main(): print(f"Dataseries '{dataseries}' not in database.") exit(1) + + if arguments['timeseries']: + params['type'] = 'timeseries' + elif arguments['releasebar']: + params['type'] = 'releasebar' + if dataseries == 'release': + print("Plotting release by release makes no sense.") + exit(1) + elif arguments['waffleplot']: + print("Waffle plots not yet implemented.") + exit(1) + + + # this parses the command line for the filter parameters. + # It then constructs SQL from those. + # TODO: stop this nonsense, use a proper ORM + filterincludes = defaultdict(set) filter = "" if arguments['']: for f in arguments['']: - if "'" in f: + if not re.match('^[0-9A-Za-z_ \+\-]*$', f): # TODO: actual, good validation - print("No please.") + print(f"Invalid characters in `{f}`. If this is legit, file a bug please.") exit(2) try: (filterseries,filteritem) = f.split('+',1) @@ -109,14 +126,21 @@ def main(): print(f"Filter `{f}` doesn't match a dataseries. (Try `variant`, `release`, `arch`, or `age`.)") exit(1) - # TODO: stop this nonsense, use a proper ORM for (incseries,incitems) in filterincludes.items(): # TODO parens only if needed filter += " AND (" inclist=set() for incitem in incitems: - inclist.add(f"{incseries} = {incitem}") + # special case age! 
+ if incseries=='age': + if incitem=='persistent': + inclist.add(f"age > 0") + continue + elif incitem=='ephemeral': + inclist.add(f"age = 0") + continue + inclist.add(f"{incseries} = '{incitem}'") filter += " OR ".join(inclist) filter += " ) " @@ -125,14 +149,7 @@ def main(): else: filterstring = "" - if arguments['timeseries']: - params['type'] = 'timeseries' - elif arguments['releasebar']: - params['type'] = 'releasebar' - if dataseries == 'release': - print("Plotting release by release makes no sense.") - exit(1) - # TODO: waffle! + # read in defaults from config.toml params.update(config[params['type']]) From 531ccdbbacb69b865984c9912171be47d22b96ae Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 19:59:12 +0000 Subject: [PATCH 40/49] wash better --- diff --git a/TODO.md b/TODO.md index a05f228..0bcaea5 100644 --- a/TODO.md +++ b/TODO.md @@ -18,8 +18,15 @@ * add numeric options (less than, greater than) to plotter filter +* clean up the horrible filter hack code + * cosmetic: group includes and excludes of the same series + so they can be displayed pretty + * validate that the excludes and includes don't overlap * validate that filter items exist in the data? +* cleanup: get my terms straight for + data set, series, axis, column, row, point, item + * text reports!!! * this week / last week / year-over-year * total systems / total persistent / total ephemeral (+%) diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index d4731c9..4a1fb71 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -11,6 +11,7 @@ Options: """ +import os import sqlite3 from string import Template @@ -102,25 +103,30 @@ def main(): # this parses the command line for the filter parameters. # It then constructs SQL from those. 
# TODO: stop this nonsense, use a proper ORM + # TODO: or really literally anything else filterincludes = defaultdict(set) filter = "" + includelist = set() + excludelist = set() if arguments['']: for f in arguments['']: - if not re.match('^[0-9A-Za-z_ \+\-]*$', f): + if not re.match('^[0-9A-Za-z_ =\!\-]*$', f): # TODO: actual, good validation print(f"Invalid characters in `{f}`. If this is legit, file a bug please.") exit(2) try: - (filterseries,filteritem) = f.split('+',1) + (filterseries,filteritem) = f.split('=',1) filterincludes[filterseries].add(filteritem) + includelist.add(f"{filterseries} = {filteritem}") except ValueError: try: - (filterseries,filteritem) = f.split('-',1) + (filterseries,filteritem) = f.split('!',1) filter+=f" AND {filterseries} != '{filteritem}'" + excludelist.add(f"{filterseries} = {filteritem}") except ValueError: - print(f"Filter `{f}` is not valid. Must be `dataseries+item` or `dataseries-item`") + print(f"Filter `{f}` is not valid. Must be `dataseries:item` or `dataseries-item`") exit(1) if not filterseries in dataserieses: print(f"Filter `{f}` doesn't match a dataseries. (Try `variant`, `release`, `arch`, or `age`.)") @@ -145,11 +151,20 @@ def main(): filter += " ) " if filter: - filterstring = ":" + ":".join(sorted(arguments[''])).replace(" ","_") + filterstring = "_" + "_".join(sorted(arguments[''])).replace(" ","_").lower() else: filterstring = "" - + # haccckkkky! 
+ + if includelist: + includetext = "Included: " + ', '.join(includelist) + else: + includetext = "" + if excludelist: + excludetext = "\nExcluded: " + ', '.join(excludelist) + else: + excludetext = "" # read in defaults from config.toml params.update(config[params['type']]) @@ -197,6 +212,9 @@ def main(): """ df = pd.read_sql_query(query, parse_dates='week', con=database) + if len(df) == 0: + print(f"No data for\n${query}") + exit(0) datatable=df.pivot(index=xaxis, columns=dataseries, @@ -339,6 +357,8 @@ def main(): 'view_label': config['view_labels'][params['graph']], 'type_label': params['label'], 'filter' : filterstring, + 'excludes' : excludetext, + 'includes' : includetext, } if 'title' in params: @@ -368,8 +388,13 @@ def main(): # Not sure why these get rotated by default. Unrotate them! plt.xticks(rotation = 0) + for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + imagepath = Template(config['imagepath']).safe_substitute(madlibs) + imagepath = Template(imagepath).safe_substitute({ 'filetype': ext }) + if filter and not ( filterstring == "_age=ephemeral" or filterstring != "_age=persistant" ): + imagepath += "/filtered" + graph.figure.savefig(f"{imagepath}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index 1e3b7da..e8758ae 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -1,10 +1,14 @@ #!/bin/bash # +if [ "$1" == "-v" ]; then + VERBOSE=1 +fi function counthits() { - true - #echo 'SELECT sum(hits) FROM checkins' | sqlite3 db/bronto.db; + if [ "$VERBOSE" = 1 ]; then + echo 'SELECT sum(hits) FROM checkins' | sqlite3 db/bronto.db; + fi } counthits @@ -73,17 +77,25 @@ counthits # Note that since we regenerate the whole db from totals.db # each week, if 
something exceeds this threshold later, it will # suddenly appear -THRESHOLD_TOTAL=100 -THRESHOLD_WEEKLY=3 +THRESHOLD_TOTAL=$( echo 'SELECT MAX(total)/100000 FROM ( SELECT variant,SUM(hits) AS total FROM CHECKINS GROUP BY variant );' | sqlite3 ./db/bronto.db ) +THRESHOLD_WEEKLY=$( echo 'SELECT MAX(total)/100000 FROM ( SELECT variant,MAX(hits) AS total FROM CHECKINS GROUP BY variant );' | sqlite3 ./db/bronto.db ) + for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do - sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); + # hello, hack! 
+ if [[ "$GROUP" == "epel" ]]; then + THRESHOLD_TOTAL=$(( THRESHOLD_TOTAL * 2 )) + THRESHOLD_WEEKLY=$(( THRESHOLD_WEEKLY * 2 )) + fi + sqlite3 db/bronto.db << EOF + BEGIN; + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING MAX(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING MAX(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING MAX(hits) < $THRESHOLD_WEEKLY); + COMMIT; EOF counthits done diff --git a/config.toml b/config.toml index 6db9d5f..8a3bfee 100644 --- a/config.toml +++ b/config.toml @@ -3,20 +3,23 @@ datafile = "db/bronto.db" color_presets = "color-presets.toml" color_cache = "db/color-cache.toml" -imagepath="images/$filetype/$dataset/" +imagepath = "images/$filetype/$dataset" figsize = [16, 9] dpi = 150 -# Entries where the highest value for any +# Columns where the highest value for any # week (or release) is not above this # percent of the total for that week (or # release) will be binned together into -# "other" +# "other". This also applies (separately!) +# to columns where the total _cumulatively_ +# does not exceed this percent of the +# total of all columns. minpercent = 0.5 + - -# Also bin excess entries with "other". 
+# Also bin excess columns with "other". # Note that this limit *does* include the # "other" line, if any. maxitems = 10 @@ -78,7 +81,7 @@ week="per week" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" -subtitle="$filter" +subtitle="$includes $excludes" filebase="$timestamp-$dataset-$type-$graph-$dataseries$filter" # TODO: list possible options! @@ -91,7 +94,7 @@ extraselect="" label="by release" table="peak" xaxis = "release" -subtitle="$filter (at release peak)" +subtitle="$includes $excludes (at release peak)" [waffleplot] subtitle="tk!" \ No newline at end of file diff --git a/run.sh b/run.sh index 0475084..fcebfbd 100755 --- a/run.sh +++ b/run.sh @@ -44,6 +44,8 @@ echo -n "* Fossilizing ancient images... " rm images/png/* 2> /dev/null rm images/svg/*/* 2> /dev/null rm images/png/*/* 2> /dev/null + rm images/svg/*/*/* 2> /dev/null + rm images/png/*/*/* 2> /dev/null echo " buried." echo -n "* Slicing brontosauruses... " @@ -80,7 +82,7 @@ echo " binaried." echo -n "* Creating exhibit cages..." for dataset in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do - mkdir -p images/{svg,png}/$dataset + mkdir -p images/{svg,png}/$dataset/filtered done echo " built!" @@ -92,21 +94,5 @@ echo " vibrant!" echo "* Drawing portraits from the fossilized remains... " #LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) #./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null - - #if [[ $? != 0 ]]; then - # echo "! Oops." 
- # exit 1 - # fi - for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates_containers fedora_rawhide_containers; do - for dataseries in age arch release variant; do - for graph in stacked share line; do - ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries - done - done - for dataseries in age arch variant; do - for graph in stacked share; do - ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries - done - done - done +./bronosaurus_plotall.sh echo " Beautiful." From c3aba618ab2721a5814146bc31d8de0cbe5103d6 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 19:59:28 +0000 Subject: [PATCH 41/49] this is temporary --- diff --git a/brontosaurus_plotall.sh b/brontosaurus_plotall.sh new file mode 100755 index 0000000..4619aa7 --- /dev/null +++ b/brontosaurus_plotall.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates_containers fedora_rawhide_containers; do + for dataseries in age arch release variant; do + for graph in stacked share line; do + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries + if [ "$dataseries" != "age"]; fi + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=persistent + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=ephemeral + fi + for sub in release variant; do + INC=$(echo "select $sub from checkins where dataset='$dataset' group by $sub;"|sqlite3 ./db/bronto.db ) + for inc in $INC; do + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc + if [ "$dataseries" != "age"]; fi + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral + fi + done + done + done + done + for dataseries in age arch variant; do + for graph in stacked share; do + ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries + if [ 
"$dataseries" != "age"]; fi + ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=persistent + ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=emphemeral + fi + for sub in age variant; do + INC=$(echo "select $sub from checkins where dataset='$dataset' group by $sub;"|sqlite3 ./db/bronto.db ) + for inc in $INC; do + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc + if [ "$dataseries" != "age" ] && [ "$sub" != "age" ]; fi + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral + fi + done + done + done + done + done From 270b55d476159ea2359fef940361d7c0dcdbe99b Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 19:59:58 +0000 Subject: [PATCH 42/49] speling --- diff --git a/run.sh b/run.sh index fcebfbd..3f8c885 100755 --- a/run.sh +++ b/run.sh @@ -94,5 +94,5 @@ echo " vibrant!" echo "* Drawing portraits from the fossilized remains... " #LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) #./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null -./bronosaurus_plotall.sh +./brontosaurus_plotall.sh echo " Beautiful." 
From fafce2140a2a33b54e683a17ae26410892e9821e Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 20:01:00 +0000 Subject: [PATCH 43/49] sshhhh you didn't see that --- diff --git a/brontosaurus_plotall.sh b/brontosaurus_plotall.sh index 4619aa7..6b53425 100755 --- a/brontosaurus_plotall.sh +++ b/brontosaurus_plotall.sh @@ -4,7 +4,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates for dataseries in age arch release variant; do for graph in stacked share line; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries - if [ "$dataseries" != "age"]; fi + if [ "$dataseries" != "age"]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=ephemeral fi @@ -12,7 +12,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates INC=$(echo "select $sub from checkins where dataset='$dataset' group by $sub;"|sqlite3 ./db/bronto.db ) for inc in $INC; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc - if [ "$dataseries" != "age"]; fi + if [ "$dataseries" != "age"]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral fi @@ -23,7 +23,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates for dataseries in age arch variant; do for graph in stacked share; do ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries - if [ "$dataseries" != "age"]; fi + if [ "$dataseries" != "age"]; then ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=persistent ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=emphemeral fi @@ -31,7 +31,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates INC=$(echo "select $sub from checkins where dataset='$dataset' group 
by $sub;"|sqlite3 ./db/bronto.db ) for inc in $INC; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc - if [ "$dataseries" != "age" ] && [ "$sub" != "age" ]; fi + if [ "$dataseries" != "age" ] && [ "$sub" != "age" ]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral fi From 363cbdb73476fd8aff4bd9c72dc85b8ecefb5828 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 20:01:23 +0000 Subject: [PATCH 44/49] everhything is terrible --- diff --git a/brontosaurus_plotall.sh b/brontosaurus_plotall.sh index 6b53425..37ee2af 100755 --- a/brontosaurus_plotall.sh +++ b/brontosaurus_plotall.sh @@ -4,7 +4,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates for dataseries in age arch release variant; do for graph in stacked share line; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries - if [ "$dataseries" != "age"]; then + if [ "$dataseries" != "age" ]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=ephemeral fi @@ -12,7 +12,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates INC=$(echo "select $sub from checkins where dataset='$dataset' group by $sub;"|sqlite3 ./db/bronto.db ) for inc in $INC; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc - if [ "$dataseries" != "age"]; then + if [ "$dataseries" != "age" ]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral fi @@ -23,7 +23,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates for dataseries in age arch variant; do for graph in stacked share; do ./brontosaurus_plotter.py 
releasebar $graph $dataset $dataseries - if [ "$dataseries" != "age"]; then + if [ "$dataseries" != "age" ]; then ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=persistent ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=emphemeral fi From 45a7469c65ad3d47a5c65ab465485cf10856d0e1 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 21:09:13 +0000 Subject: [PATCH 45/49] ok, definitely done for today :) --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 4a1fb71..230fdda 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -392,7 +392,7 @@ def main(): for ext in config['image_types']: imagepath = Template(config['imagepath']).safe_substitute(madlibs) imagepath = Template(imagepath).safe_substitute({ 'filetype': ext }) - if filter and not ( filterstring == "_age=ephemeral" or filterstring != "_age=persistant" ): + if filter and not ( filterstring == "_age=ephemeral" or filterstring == "_age=persistant" ): imagepath += "/filtered" graph.figure.savefig(f"{imagepath}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") From bceef7b54df2e2fc1659a6682c6e88dbc2ba07f0 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 21:28:43 +0000 Subject: [PATCH 46/49] don't do "other" for age --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 230fdda..fbdc262 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -226,50 +226,52 @@ def main(): # datatable.resample('W-MON') #pprint(datatable) - # Find the items below thresholds for percent in any given - # dataseries entry, and also for excess number of items, - # and bin them together into "other" - # - # We consider three things: - # - # * percent of at least one row (week or release - # depending on chart type) must exceed minpercent - # - # * percent of total must also. 
- # - # * but wait, if the percent of any of the last four - # rows is above the threshold, keep that after all - # - # TODO: weight this towards the end of the data, so we don't drop - # emerging interesting things in favor of old news? - # - # old way: toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 - # - toosmall = datatable[datatable==datatable.max()].div(datatable.sum(axis=1),axis=0).max() < config['minpercent'] / 100 - toosmall |= datatable.sum()/datatable.sum().sum() < config['minpercent'] / 100 - toosmall &= datatable[-4:].div(datatable.sum(axis=1),axis=0).fillna(0).max() < config['minpercent'] / 100 - others = toosmall[toosmall == True].keys() - othercol = datatable[others].sum(axis=1).astype("Int64") - datatable.drop(columns=others, inplace=True) - - # still too big? - if len(datatable.columns) > config['maxitems']: - # the -1 in `config['maxitems']-1` is so we don't exceed the - # limit by adding the "others" column! - others = datatable.sum(axis=0).sort_values(ascending=False)[config['maxitems']-1:].index - othercol += datatable[others].sum(axis=1).astype("Int64") - datatable.drop(columns=others, inplace=True) - - # if the remaining "other" ends up big enough to matter, add it to the table - # the division is: highest row (release, say) for the item, compared to the total for that row - if othercol.any() and othercol.max() / datatable.sum(axis=1).max() >= config['minpercent'] / 100: - datatable['other'] = othercol - # For bar charts, drop any rows (bars) which are below the threshold - if kind == 'bar': - toosmall=datatable.sum(axis=1)/datatable.sum(axis=1).max() < config['minpercent'] / 100 - datatable.drop(toosmall[toosmall == True].keys(), inplace=True) + if dataseries != 'age': + # Find the items below thresholds for percent in any given + # dataseries entry, and also for excess number of items, + # and bin them together into "other" + # + # We consider three things: + # + # * percent of at least one row 
(week or release + # depending on chart type) must exceed minpercent + # + # * percent of total must also. + # + # * but wait, if the percent of any of the last four + # rows is above the threshold, keep that after all + # + # TODO: weight this towards the end of the data, so we don't drop + # emerging interesting things in favor of old news? + # + # old way: toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 + # + toosmall = datatable[datatable==datatable.max()].div(datatable.sum(axis=1),axis=0).max() < config['minpercent'] / 100 + toosmall |= datatable.sum()/datatable.sum().sum() < config['minpercent'] / 100 + toosmall &= datatable[-4:].div(datatable.sum(axis=1),axis=0).fillna(0).max() < config['minpercent'] / 100 + others = toosmall[toosmall == True].keys() + othercol = datatable[others].sum(axis=1).astype("Int64") + datatable.drop(columns=others, inplace=True) + # still too big? + if len(datatable.columns) > config['maxitems']: + # the -1 in `config['maxitems']-1` is so we don't exceed the + # limit by adding the "others" column! 
+ others = datatable.sum(axis=0).sort_values(ascending=False)[config['maxitems']-1:].index + othercol += datatable[others].sum(axis=1).astype("Int64") + datatable.drop(columns=others, inplace=True) + + # if the remaining "other" ends up big enough to matter, add it to the table + # the division is: highest row (release, say) for the item, compared to the total for that row + if othercol.any() and othercol.max() / datatable.sum(axis=1).max() >= config['minpercent'] / 100: + datatable['other'] = othercol + + # For bar charts, drop any rows (bars) which are below the threshold + if kind == 'bar': + toosmall=datatable.sum(axis=1)/datatable.sum(axis=1).max() < config['minpercent'] / 100 + datatable.drop(toosmall[toosmall == True].keys(), inplace=True) + # If the items in this dataset aren't numeric, # sort columns by weight From cbda02742ce46c969323affd59b3f3bf3be627c7 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 21:57:33 +0000 Subject: [PATCH 47/49] wip --- diff --git a/TODO.md b/TODO.md index 0bcaea5..317d876 100644 --- a/TODO.md +++ b/TODO.md @@ -10,6 +10,8 @@ 4. have some script that pre-renders some defaults (PARTIAL) 5. 
and a simple front-end for exploring the rest +* option to center date on highest peak (and given number of months on + each side * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index fbdc262..c865b40 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -373,19 +373,31 @@ def main(): Template(params['subtitle']).safe_substitute(madlibs), fontsize=14) - plt.autoscale(enable=True, axis='x', tight=True) + #plt.autoscale(enable=True, axis='x', tight=True) plt.autoscale(enable=True, axis='y', tight=False) graph.set_ylim([0, None]) graph.spines['right'].set_visible(False) graph.spines['top'].set_visible(False) graph.yaxis.set_major_formatter(yformatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) graph.set_xlabel('') - # aesthetic pickiness! - if kind == 'bar': - ax.tick_params(bottom=False) + + + + match kind: + case 'bar': + # aesthetic pickiness! + ax.tick_params(bottom=False) + case _: + # pretty date labels + + label_format = '{:,%b %Y}' + ax.xaxis.set_major_locator(m.ticker.MaxNLocator(3)) + ticks_loc = ax.get_xticks().tolist() + ax.xaxis.set_major_locator(m.ticker.FixedLocator(ticks_loc)) + ax.set_xticklabels([label_format.format(x) for x in ticks_loc]) + ax.figure.autofmt_xdate(rotation=0, ha='center') # Not sure why these get rotated by default. Unrotate them! plt.xticks(rotation = 0) From f01f7e58c3283e8ac1758379f6cb1466fac93a80 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 27 2022 21:57:46 +0000 Subject: [PATCH 48/49] well. that was ridiculous. but it works! 
--- diff --git a/TODO.md b/TODO.md index 317d876..192982e 100644 --- a/TODO.md +++ b/TODO.md @@ -13,6 +13,8 @@ * option to center date on highest peak (and given number of months on each side +* consider making date in bronto.db be the _end_ of the week +* * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index c865b40..ee14361 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -11,13 +11,11 @@ Options: """ -import os import sqlite3 from string import Template from pprint import pprint -import pandas as pd import toml import re @@ -25,8 +23,13 @@ import re import matplotlib as m import matplotlib.pyplot as plt + +import pandas as pd + from docopt import docopt +from datetime import datetime, timezone + from collections import defaultdict from brontosaurusifier_utils import colormapping @@ -325,6 +328,7 @@ def main(): colormap=colormap, kind=kind, stacked=stacked, + xlim=[datetime.strptime("2021-01-01",'%Y-%m-%d'),datetime.strptime(timestamp,'%Y-%m-%d')] ) # Labels and titles and stuff. @@ -373,7 +377,7 @@ def main(): Template(params['subtitle']).safe_substitute(madlibs), fontsize=14) - #plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) graph.set_ylim([0, None]) graph.spines['right'].set_visible(False) @@ -383,21 +387,21 @@ def main(): - - match kind: case 'bar': # aesthetic pickiness! 
ax.tick_params(bottom=False) + plt.autoscale(enable=True, axis='x', tight=True) case _: # pretty date labels - - label_format = '{:,%b %Y}' - ax.xaxis.set_major_locator(m.ticker.MaxNLocator(3)) - ticks_loc = ax.get_xticks().tolist() - ax.xaxis.set_major_locator(m.ticker.FixedLocator(ticks_loc)) - ax.set_xticklabels([label_format.format(x) for x in ticks_loc]) - ax.figure.autofmt_xdate(rotation=0, ha='center') + # + # This is horrific, but I can't get matplotlib and pandas to cooperate, + # so for some reason matplotlib sees this as week numbers. + ax.xaxis.set_major_formatter(m.ticker.FuncFormatter(lambda x, _: datetime.fromtimestamp(604800*x-1,timezone.utc).strftime("%B\n%Y"))) + # hide last label so it doesn't overlap. + ltick=ax.xaxis.get_major_ticks()[-1] + ltick.label1.set_visible(False) + ltick.tick1line.set_visible(False) # Not sure why these get rotated by default. Unrotate them! plt.xticks(rotation = 0) @@ -411,7 +415,7 @@ def main(): graph.figure.savefig(f"{imagepath}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") - #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + #graph.figure.savefig(f"images/test.png", dpi=config['dpi']/2, bbox_inches="tight") plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") From 3e1e7612c87e55efcd170a902754b160ded891e2 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 28 2022 11:29:50 +0000 Subject: [PATCH 49/49] spelling --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index ee14361..5914abb 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -410,7 +410,7 @@ def main(): for ext in config['image_types']: imagepath = Template(config['imagepath']).safe_substitute(madlibs) imagepath = Template(imagepath).safe_substitute({ 'filetype': ext }) - if filter and not ( filterstring == "_age=ephemeral" or filterstring == "_age=persistant" ): + if filter and not ( 
filterstring == "_age=ephemeral" or filterstring == "_age=persistent" ): imagepath += "/filtered" graph.figure.savefig(f"{imagepath}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight")