From 7d8c5a9e7501b924e26403d681d2197318437dc6 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jan 27 2022 22:02:28 +0000 Subject: [PATCH 1/49] Wait no -- THIS is the starting point. Don't dig below here. Merge branch 'parameterizing' into default --- diff --git a/NOTES.md b/NOTES.md index 2769b7a..bd42553 100644 --- a/NOTES.md +++ b/NOTES.md @@ -14,6 +14,3 @@ silly ones. We do still keep the architecture match, though. We could do some other interesting reports on the outlier data — queries that come from different OSes, or across architectures. These are usually cross-compile builds or container cases. - -The database uses "passel" instead of "group" because SQL is silly. I try -not to expose that to... anything visible outside the code. \ No newline at end of file diff --git a/TODO.md b/TODO.md index 45caae0..2f78dc7 100644 --- a/TODO.md +++ b/TODO.md @@ -1,16 +1,3 @@ -* OK FINE USE "dataset" instead of "passel" - -* First thing: change the format so "group" is a thing instead of separate - tables. - -* GALAXY BRAIN: render charts from the nc files rather than having - a csv intermediary. then we won't have to split out ephemeral/persistent - (Earlier merely big brain: I was thinking that different data sets like - epel and fedora-updates would have very different reports. But I - decided to make them the same after all. So it's just an extra step.) - -* still write out some CSV files because they're handy! - * text reports!!! * this week / last week / year-over-year * total systems / total persistent / total ephemeral (+%) @@ -25,6 +12,9 @@ * architecture as above * share of category (desktop/server-cloud-iot/labs) +* use jinjasql for the query templates! +* sanitize everything coming from config.toml, really. 
+ * for the slicer, put the groups in their definitions in the config.toml * be smarter about which timeseries to make @@ -76,6 +66,10 @@ * desktop,server +* I guess we should make it so the timeseries definitions can loop over multiple datasets to avoid + a lot of redundancy. Or at least, to apply to all Fedora datasets? (Yes, that: introduce a "distro" grouping.) + - need a way to actually include multiple datasets at once though, like for the fedora linux + epel graph + * predefined colors for some things * fix it so colors don't overlap when there's more than 12 options. @@ -90,6 +84,8 @@ * something is messed up with the old waffle chart code. throw away, start again +* sanatize all values read from config.toml + * useful waffle charts (show current week, maybe average last 2-4): * full [arch,variant,release] (with different shape for ephemeral!) * Breakouts (multiple charts per file?) diff --git a/brontosaurus-egg-sorter.py b/brontosaurus-egg-sorter.py index d26b98e..e8dff2e 100755 --- a/brontosaurus-egg-sorter.py +++ b/brontosaurus-egg-sorter.py @@ -104,7 +104,7 @@ onecounter = Counter() loopcursor.execute( - "SELECT DISTINCT(passel) FROM checkins ORDER BY passel DESC") + "SELECT DISTINCT(dataset) FROM checkins ORDER BY dataset DESC") groups = [item for sublist in loopcursor.fetchall() for item in sublist] @@ -113,7 +113,7 @@ for group in groups: onecounter.clear() loopcursor.execute( - 'SELECT * FROM checkins WHERE passel = :passel AND age = 1 ORDER BY week', {'passel': group}) + 'SELECT * FROM checkins WHERE dataset = :dataset AND age = 1 ORDER BY week', {'dataset': group}) for row in loopcursor: (week, group, release, variant, arch, age, hits) = row @@ -122,7 +122,7 @@ for group in groups: # get the other age groups for this type of system, if any query = """SELECT age,hits FROM checkins WHERE week = :week AND - passel = :passel AND + dataset = :dataset AND release = :release AND variant = :variant AND arch = :arch AND @@ -131,7 +131,7 @@ for group in 
groups: """ nextcursor.execute(query, {"week": week, - "passel": group, + "dataset": group, "release": release, "variant": variant, "arch": arch @@ -158,7 +158,7 @@ for group in groups: if group.split('_', 1)[0] == 'fedora': query = """SELECT age,sum(hits) FROM checkins WHERE week = :nextweek AND - passel = :passel AND + dataset = :dataset AND release >= :release AND variant = :variant AND arch = :arch AND @@ -168,7 +168,7 @@ for group in groups: else: query = """SELECT age,sum(hits) FROM checkins WHERE week = :nextweek AND - passel = :passel AND + dataset = :dataset AND release = :release AND variant = :variant AND arch = :arch AND @@ -177,7 +177,7 @@ for group in groups: ORDER BY age""" nextcursor.execute(query, {"nextweek": (datetime.datetime.fromisoformat(week) + datetime.timedelta(weeks=1)).strftime("%Y-%m-%d"), # this was easier when it was weeknums! - "passel": group, + "dataset": group, "release": release, "variant": variant, "arch": arch @@ -248,10 +248,10 @@ for group in groups: new_zero, new_one, thisone) nextcursor.execute("""INSERT INTO checkins - (week, passel, release, variant, arch, age, hits) - VALUES (:week, :passel, :release, :variant, :arch, :age, :hits)""", + (week, dataset, release, variant, arch, age, hits) + VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", {"week": week, - "passel": group, + "dataset": group, "release": release, "variant": variant, "arch": arch, @@ -259,10 +259,10 @@ for group in groups: "hits": new_zero }) nextcursor.execute("""REPLACE INTO checkins - (week, passel, release, variant, arch, age, hits) - VALUES (:week, :passel, :release, :variant, :arch, :age, :hits)""", + (week, dataset, release, variant, arch, age, hits) + VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", {"week": week, - "passel": group, + "dataset": group, "release": release, "variant": variant, "arch": arch, diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index 42b07e8..c3aa3a8 100755 --- 
a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -2,9 +2,9 @@ import matplotlib.dates as dates import matplotlib.pyplot as plt + import sqlite3 -import os -import re +from string import Template from collections import defaultdict from collections import OrderedDict @@ -25,15 +25,9 @@ m.rcParams['font.size'] = 12 m.rcParams['font.family'] = 'Montserrat' m.rcParams['legend.frameon'] = False -# fix me: define pretty labels in view_defaults -AGE_LABELS = {'0': 'Ephemeral', - '1': 'First week', - '2': '2-4 weeks', - '3': '5-24 weeks', - '4': '25+ weeks'} - def get_colors(colormappings, colorlist, dataset, dataseries, items): + """This makes colors 'sticky' for the whole run.""" key = dataset + '_' + dataseries @@ -49,260 +43,141 @@ def get_colors(colormappings, colorlist, dataset, dataseries, items): return outcolors -def graph_timeseries(view, dataframe, colormappings, dataset, dataseries): +def graph_timeseries(config, colormappings, params, dataframe): + """Draws line or area chart for a dataseries over time.""" - # If we find we have missing data, in the future + # If we find we have missing data, in the future: # dataframe.resample('W-MON') + dataset = params['dataset'] + dataseries = params['dataseries'] + ################# - # cull the weak - # fixme: accumlate these into "other" + # Instead of this, accumulate anything more than 10 into "other" + # ... and do it _elsewhere_ (easier to do before pivot anyway!) # + limit number of columns to 10 + other hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < view['hidepercent']/100 + axis=1), axis=0).max() < 0.2/100 dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) ################## # our colors. # the complication here is keeping the same color for the same label # across multiple graphs! 
- cmap = m.colors.ListedColormap(get_colors(colormappings, view['colors'], + cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], dataset, dataseries, list(dataframe.columns))) - cmap_r = m.colors.ListedColormap(cmap.colors[::-1]) ################## # and now.... graph it! # FIXME: this is ugly - startdate = view['startdate'][dataset.split('_', 1)[0]] - - # lines - graph = dataframe[startdate:].plot(figsize=view['figsize'], colormap=cmap) - - ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() - if dataseries == 'age': - labels = list(map(AGE_LABELS.get, labels)) - plt.legend(handles, labels, loc='center left', bbox_to_anchor=(1.0, 0.5)) - - plt.suptitle(dataset + ": " + - dataseries + " over time", fontsize=24) - if view['ephemeral'] != 'all': - graph.set_title(view['ephemeral'] + " systems", fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - graph.figure.savefig('images/svg/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '.svg', dpi=view['dpi'], bbox_inches="tight") - graph.figure.savefig('images/png/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '.png', dpi=view['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - - # stacked - - reversed = dataframe[dataframe.columns[::-1]] - graph = reversed[startdate:].plot( - figsize=view['figsize'], colormap=cmap_r, kind='area') - - ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() - if dataseries == 'age': - labels = list(map(AGE_LABELS.get, labels)) - plt.legend(handles[::-1], labels[::-1], - 
loc='center left', bbox_to_anchor=(1.0, 0.5)) - - plt.suptitle(dataset + ": " + - dataseries + " over time (stacked)", fontsize=24) - if view['ephemeral'] != 'all': - graph.set_title(view['ephemeral'] + " systems", fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - graph.figure.savefig('images/svg/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '-stacked.svg', dpi=view['dpi'], bbox_inches="tight") - graph.figure.savefig('images/png/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '-stacked.png', dpi=view['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - - # area (share / percent) - - percentframe = dataframe.div(dataframe.sum(axis=1), axis=0)*100 - - graph = percentframe[startdate:].plot( - figsize=view['figsize'], colormap=cmap, kind='area') - - ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() - if dataseries == 'age': - labels = list(map(AGE_LABELS.get, labels)) - plt.legend(handles, labels, loc='center left', bbox_to_anchor=(1.0, 0.5)) - - plt.suptitle(dataset + ": " + - dataseries + " over time (share)", fontsize=24) - if view['ephemeral'] != 'all': - graph.set_title(view['ephemeral'] + " systems", fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - graph.figure.savefig('images/svg/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '-share.svg', dpi=view['dpi'], bbox_inches="tight") - 
graph.figure.savefig('images/png/' + dataset + '/' + dataset + '-timeseries-' + dataseries + - '-' + view['ephemeral'] + '-share.png', dpi=view['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - - -""" -def graph_average(view, colormappings): - - if view['ephemeral'] == 'all': - csvfile = 'csv/' + dataset + '-average-' + \ - view['columns'] + '-by-' + view['rows'] + '.csv' - elif view['ephemeral'] == 'ephemeral': - csvfile = 'csv/' + dataset + '-average-' + \ - view['columns'] + '-by-' + view['rows'] + '-ephemeral.csv' - elif view['ephemeral'] == 'persistent': - csvfile = 'csv/' + dataset + '-average-' + \ - view['columns'] + '-by-' + view['rows'] + '-persistent.csv' - else: - print("Invalid value for 'ephemeral'") - print(csvfile) - - dataframe = pd.read_csv(csvfile, header='>', index_col=0) - - for index, row in dataframe.iterrows(): - - # values which are less than one box are gonna get combined - otherlist = row.div(row.sum()) * \ - (view['waffle']['rows']*view['waffle']['rows']) < 1 - otherval = otherval = row[otherlist[otherlist == True].keys()].sum() - prunedrow = row.loc[otherlist[otherlist == False].keys()] - # if the sum of the discards is big enough to make a block, add an "other" row - if (otherval / (prunedrow.sum() + otherval)) * (view['waffle']['rows']*view['waffle']['rows']) >= 1: - # print("Debug: adding 'other' entry.") - prunedrow['other'] = otherval - - # this makes sure we sort these things by number, rather than by weight - # fixme: don't special case here; instead, put sort options in the view definition - if view['rows'] == 'age' or view['rows'] == 'release': - data = dict(prunedrow.sort_index(ascending=True)) - else: - data = dict(prunedrow.sort_values(ascending=True)) - - # keep colors consistent per label - colors = get_colors(colormappings, view['colors'], - dataset, view['rows'], data.keys()) - - # make the age labels human-readable. - # is this the right place to do this? probably not. 
- if view['rows'] == 'age': - data = dict(zip(AGE_LABELS.values(), data.values())) - - if view['ephemeral'] != 'all': - label = f"{dataset}: {view['rows']} for {view['columns']} {index} ({view['ephemeral']})" - else: - label = f"{dataset}: {view['rows']} for {view['columns']} {index}" - - total = sum(list(data.values())) - fig = plt.figure( - FigureClass=Waffle, - rows=view['waffle']['rows'], - columns=view['waffle']['columns'], - values=data, - figsize=view['figsize'], - rounding_rule='ceil', - colors=colors, - title={'label': label, - 'loc': 'center', - 'pad': 48, - 'fontdict': {'fontsize': 24} - }, - legend={ - 'labels': [f"{k} ({v*100/total:.1f}%)" for k, v in data.items()], - 'loc': 'lower center', - 'ncol': 5, - 'framealpha': 0, - 'bbox_to_anchor': (0.5, -0.2), - 'fontsize': 14 - } - - ) + startdate = config['startdate'][dataset.split('_', 1)[0]] + + for view in params['views']: + + match view: + case 'line': + df = dataframe[startdate:] + kind = 'line' + colormap = cmap + case 'stacked': + df = dataframe[startdate:][dataframe.columns[::-1]] + kind = 'area' + colormap = m.colors.ListedColormap(cmap.colors[::-1]) + case 'share': + df = dataframe[startdate:].div( + dataframe.sum(axis=1), axis=0)*100 + kind = 'area' + colormap = cmap + + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind) + + # Labels and titles and stuff. 
+ ax = plt.gca() + + handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) + + if view == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] + + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) + + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': view, + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][view]} + + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) + + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) + + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') + + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") + + plt.close(graph.figure) + print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") - safeindex = re.sub('[\W_-]', '', str(index)) - basename = dataset + '-waffle-' + view['rows'] + \ - '-for-' + view['columns'] + '-' + \ - safeindex + '-' + view['ephemeral'] - - fig.savefig('images/svg/' + - dataset + '/' + basename + '.svg', dpi=view['dpi']) - fig.savefig('images/png/' + - dataset + '/' + basename + '.png', dpi=view['dpi']) - - plt.close() -""" ########################################### def main(): - defaults = 
toml.load("view-defaults.toml") + config = toml.load("config.toml") colormappings = defaultdict(OrderedDict) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) cursor = database.cursor() - cursor.execute( - "SELECT DISTINCT(passel) FROM checkins ORDER BY passel DESC") - groups = [item for sublist in cursor.fetchall() for item in sublist] - - for group in groups: - for column in ["release", "variant", "age", "arch"]: - # f-string normally dangerous but here we are using - # the hard-coded values above, and "group" is also - # something we control. - df = pd.read_sql_query(f"""SELECT - week, - {column}, - SUM(hits) as hits - FROM checkins - WHERE passel=\"{group}\" - GROUP BY week,{column} - ORDER BY week""", - parse_dates='week', - con=database) - - graph_timeseries( - defaults, - colormappings=colormappings, - dataframe=df.pivot(index='week', columns=column, - values='hits').astype("Int64"), - dataset=group, - dataseries=column - ) + + for timeseries in config['timeseries']: + params = config['timeseries_defaults'].copy() + params.update(timeseries) + + query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits + FROM checkins + WHERE dataset=\"{params['dataset']}\" + {params['extraselect']} + GROUP BY week,{params['dataseries']} + ORDER BY week + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + graph_timeseries( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], + values='hits').astype("Int64"), + ) if __name__ == "__main__": diff --git a/brontosaurus-slicer.sh b/brontosaurus-slicer.sh index 258f663..8235990 100755 --- a/brontosaurus-slicer.sh +++ b/brontosaurus-slicer.sh @@ -5,7 +5,7 @@ # # It splits the records into major groups: EPEL, and then also # "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. -# Because "group" is a reserved word in sql, we use "passel". 
+# Because "group" is a reserved word in sql, we use "dataset". # # It removes the os_ prefix, because without repo_ columns there # is no ambiguity to resolve. @@ -27,18 +27,18 @@ DROP TABLE IF EXISTS bronto.checkins; CREATE TABLE bronto.checkins( week INT, - passel TEXT, + dataset TEXT, release TEXT, variant TEXT, arch TEXT, age INT CHECK(age<5), hits INT, - UNIQUE (week,passel,release,variant,arch,age) + UNIQUE (week,dataset,release,variant,arch,age) ); INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_systems" AS passel, + "fedora_updates_systems" AS dataset, os_version AS release, os_variant AS variant, os_arch AS arch, @@ -56,7 +56,7 @@ INSERT INTO bronto.checkins INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_containers" AS passel, + "fedora_updates_containers" AS dataset, os_version AS release, os_variant AS variant, os_arch AS arch, @@ -75,7 +75,7 @@ INSERT INTO bronto.checkins INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_systems" AS passel, + "fedora_rawhide_systems" AS dataset, os_version AS release, os_variant AS variant, os_arch AS arch, @@ -92,7 +92,7 @@ INSERT INTO bronto.checkins INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_containers" AS passel, + "fedora_rawhide_containers" AS dataset, os_version AS release, os_variant AS variant, os_arch AS arch, @@ -110,7 +110,7 @@ INSERT INTO bronto.checkins INSERT INTO bronto.checkins SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "epel" AS passel, + "epel" AS dataset, CASE instr(os_version,".") WHEN 0 THEN os_version ELSE substr(os_version,0,instr(os_version,".")) diff --git a/brontosaurus-washer.sh b/brontosaurus-washer.sh index e6a1b5f..8dad9ef 100755 --- a/brontosaurus-washer.sh +++ b/brontosaurus-washer.sh @@ -40,10 +40,10 @@ FEDORA_STARTDAY='2020-04-27' 
EPEL_STARTVER=8 EPEL_STARTDAY='2021-01-01' sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE passel GLOB "fedora*" AND release < $FEDORA_STARTVER; - DELETE FROM checkins WHERE passel GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; - DELETE FROM checkins WHERE passel GLOB "epel*" AND release < $EPEL_STARTVER; - DELETE FROM checkins WHERE passel GLOB "epel*" AND week < "$EPEL_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; EOF # Clean up entries for name, arch, or release that show up @@ -59,13 +59,13 @@ EOF THRESHOLD_TOTAL=100 THRESHOLD_WEEKLY=3 -for GROUP in $(echo 'SELECT DISTINCT(passel) FROM checkins;' | sqlite3 ./db/bronto.db); do +for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE passel = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE passel = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE passel = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE passel = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE passel = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE passel = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE passel = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE passel = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE passel = "$GROUP" AND release IN (SELECT release FROM checkins WHERE passel = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE passel = "$GROUP" AND release IN (SELECT release FROM checkins WHERE 
passel = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); EOF done diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..a6c5c58 --- /dev/null +++ b/config.toml @@ -0,0 +1,437 @@ +ephemeral = "all" + +figsize = [16, 9] +dpi = 300 + + +# Our palette. Note that this also limits the +# number of items per chart. If there are +# more than the number of colors, the last +# color here becomes "other". +# (TODO! Implement that!) +colors = [ + '#51a2da', + '#294172', + '#afea85', + '#db3279', + '#f5a326', + '#b193c8', + '#38bc3b', + '#3c6eb4', + '#eb7434', + '#603e79', + '#ffd117', + '#aad0ee', + '#101010', + '#535961', +] + +# could be png, pdf, svg +# TODO: not yet implemented +image_types = ["png"] + +# todo: change the fedora start date to also 2021-01-01, after DevConf.cz 2022. 
+[startdate] +fedora = '2020-04-27' # week of Fedora 32 release +epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 + +[dataset_labels] +epel = "Extra Packages for Enterprise Linux" +fedora_updates_systems = "Fedora Linux systems" +fedora_updates_containers = "Fedora Linux containers" +fedora_rawhide_systems = "Fedora Rawhide systems" +fedora_rawhide_containers = "Fedora Rawhide containers" + +[dataseries_labels] +arch="CPU architecture" +release="release " +variant="variant" +age="age category" + +[age_labels] +'0'='Ephemeral' +'1'='First week' +'2'='2-4 weeks' +'3'='5-24 weeks' +'4'='25+ weeks' + +[view_labels] +'line'="" +'stacked'=" (stacked)" +'share'=" (share)" + +[timeseries_defaults] +title="$dataset_label: weekly checkins by $dataseries_label$view_label" +filebase="$dataset-timeseries-$dataseries-$view" +extraselect="" +# not all of these are implemented. But we could have... +#subtitle= +#dataset= +#dataseries= +#orderbyhits= +#reverse= +# todo: back to the idea of reading these +# from individual, merged configuration files! 
+ +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="release" 
+views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] 
+filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="age" +views=['share','stacked'] + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" 
+dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="release" +views=['line','stacked','share'] 
+extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="epel" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="epel" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="epel" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + diff --git a/run.sh b/run.sh index d19d5d5..71512cf 100755 --- a/run.sh +++ b/run.sh @@ -71,15 +71,13 @@ echo -n "* Sorting the eggs... " echo " binaried." echo "* Creating cages for different exhibits..." - for dataset in $(echo 'SELECT DISTINCT(passel) FROM checkins;' | sqlite3 ./db/bronto.db); do + for dataset in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do mkdir -p images/{svg,png}/$dataset done echo " Built!" echo "* Drawing portraits from the fossilized remains... 
" - #LINES=$(ls csv/*.csv |wc -l) - # FIXME - LINES=30 + LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) ./brontosaurus-plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null if [[ $? != 0 ]]; then echo "! Oops." diff --git a/view-defaults.toml b/view-defaults.toml deleted file mode 100644 index bfe46d8..0000000 --- a/view-defaults.toml +++ /dev/null @@ -1,51 +0,0 @@ -ephemeral = "all" - -figsize = [16, 9] -dpi = 300 - -# todo: change the fedora start date to also 2021-01-01, after DevConf.cz 2022. - -startdate.fedora = '2020-04-27' # week of Fedora 32 release -startdate.epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 - -waffle.columns = 50 -waffle.rows = 20 - -# hide below this percent -hidepercent = 0.2 - -# our palette -colors = [ - '#51a2da', - '#294172', - '#afea85', - '#db3279', - '#f5a326', - '#b193c8', - '#38bc3b', - '#3c6eb4', - '#eb7434', - '#603e79', - '#ffd117', - '#aad0ee', - '#101010', - '#535961', -] - -#timeseries.types.release = ["line", "stacked", "share"] -#timeseries.types.variant = ["stacked", "share"] -#timeseries.types.arch = ["line", "share"] -#timeseries.types.age = ["line", "share"] - -#timeseries.ephemeral.release = "combined" -#timeseries.ephemeral.variant = "separate" -#timeseries.ephemeral.arch = "combined" -#timeseries.ephemeral.age = "none" - -#epel.group.classic = "CentOS Linux" -#epel.variant_variants = [ {"Without CentOS Linux": "-classic" }] - -# TODO: Figure out an expressive way to do this. -#fedora.group.container = ["container","toolbox","snappy"] -#fedora.group.server = ["cloud","coreos","iot","server"] -#fedora.group.ostree = ["coreos","iot","kinoite","silverblue"] From 5115258961f3b622be8fff6993835a41443e4d19 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jan 27 2022 23:15:39 +0000 Subject: [PATCH 2/49] wip. find the peak weeks. 
--- diff --git a/TODO.md b/TODO.md index 2f78dc7..a258cef 100644 --- a/TODO.md +++ b/TODO.md @@ -17,43 +17,7 @@ * for the slicer, put the groups in their definitions in the config.toml -* be smarter about which timeseries to make - - * age already includes 0 and 1-4, so having separate ephemeral/persistent - views isn't useful - - * age unstacked line chart isn't really useful -- skip. - - * arch stacked chart isn't super useful either -- share is best, line is - ok. - - few enough lines that we can probably put ephemeral and persistent - on same chart. - - * variant stacked also isn't useful -- share is best, line ok - - but ephemeral vs persistent is a nightmare! - - * for release, all three are good (but maybe present ephemeral and - persistent on same line chart?) - - * secondary timeline charts for variants: - - * epel variants without centos linux (or rhel?) - - * fedora variants with just: - * desktop variants - * server/cloud/iot variants - * labs (compneuro, design suite) - * the three above, grouped - * ostree vs non-ostree (summed!) - - * the grouped one for arch - - * So, that's: - - * age over time — share and stacked (no special handling for ephemeral) - * arch over time — share and line (ephemeral on same chart?) - * variant over time — share and line (ephemeral separate charts) - * release — line, share, stacked (ephemeral on same charts?) +* secondary timeline charts for variants: * variant variants! * epel without CentOS Linux @@ -81,9 +45,6 @@ point, not summed (because that's its most interesting!) * don't bother with ephemeral/persistent view (age view is enough) -* something is messed up with the old waffle chart code. throw away, start - again - * sanatize all values read from config.toml * useful waffle charts (show current week, maybe average last 2-4): @@ -97,11 +58,6 @@ * make animations by week of full [arch,variant,release] * maybe of the breakouts too? 
-* Instead of a hard-coded thing in the plotter, generalize the - table and column-name-to-human-term code. Could also be used for formatting - "Mate-Compiz" and the like. - - * change the timeseries "hide" to collect small things into "other" @@ -124,41 +80,17 @@ old systems dropping out and being replaced by new ones. (In the latter case, we have _fewer_ ephemerial systems than we are currently guessing.) -* clean up the in-triplicate writing for ephemeral, permanent, and all - -* add totals for the waffle charts - * skip waffle charts that will never be interesting -* figure out how to estimate chart time better - * once we have more than a year of data, start Fedora chart at 2021-01-01, same as epel, because that initial growth curve is not really representative of anything but upgrades and all the initial data therefore skewed -* add numeric labels to the waffle charts! ("1 square = nnn systems") - -* Add Rawhide as a separate table. Needs special handling because it's hard - to sort out development on a regular Fedora OS release vs actually running - Rawhide. - -* something to make colors consistent - -* Filtering out obviously ridiculous data should be done before - the "dicer" stage, because otherwise it balloons the dataset. * Change ./run.sh into a makefile, because old-school. - -* Rework it so temporary files go in tmpdirs and data goes in var or - something (configurable) - - -* Related todo: with the by-release graphs, stop after the release is - no longer current. - -* Bonus: separate graphs for "which variants tend to persist after EOL" +* Can we get anything interesting for "which variants tend to persist after EOL"? * import estimates from old data @@ -177,6 +109,7 @@ * fix the code in brotosaurus washer to merge '' to 'none' rather than just renaming (works now because there are no natural 'none' entries). 
+ * map "unknown" to "generic" * instead of throwing away entries in the washing phase (especially those diff --git a/brontosaurus-fight.sh b/brontosaurus-fight.sh new file mode 100755 index 0000000..ba14da8 --- /dev/null +++ b/brontosaurus-fight.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# +# Create a view which only shows the weeks where each release +# is at its peak. If someone actually is Good At SQL, I would +# not mind help making this more clear. + +sqlite3 db/bronto.db << EOF + DROP VIEW IF EXISTS peak; + CREATE VIEW peak AS + SELECT checkins.week, + checkins.dataset, + checkins.release, + checkins.variant, + checkins.arch, + checkins.age, + checkins.hits + FROM checkins + INNER JOIN + (SELECT week,dataset,release,max(hits) + FROM (SELECT week,dataset,release,sum(hits) AS hits + FROM checkins + GROUP BY week,dataset,release + ORDER BY week) + GROUP BY dataset,release) AS peaks + ON peaks.week = checkins.week AND peaks.release = checkins.release; +EOF diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index c3aa3a8..5eb4401 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -156,17 +156,17 @@ def main(): colormappings = defaultdict(OrderedDict) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - cursor = database.cursor() - + # cursor = database.cursor() + ''' for timeseries in config['timeseries']: params = config['timeseries_defaults'].copy() params.update(timeseries) query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits FROM checkins - WHERE dataset=\"{params['dataset']}\" + WHERE dataset =\"{params['dataset']}\" {params['extraselect']} - GROUP BY week,{params['dataseries']} + GROUP BY week, {params['dataseries']} ORDER BY week """ df = pd.read_sql_query(query, parse_dates='week', con=database) @@ -178,6 +178,24 @@ def main(): dataframe=df.pivot(index='week', columns=params['dataseries'], values='hits').astype("Int64"), ) + ''' + + # sorry about this. 
+ # what it does is: find all the rows from the peak + # week for each release. + query = """SELECT checkins.week,checkins.dataset,checkins.release,checkins.variant,checkins.arch,checkins.age,checkins.hits from checkins INNER JOIN + (SELECT week,dataset,release,max(hits) + FROM (SELECT week,dataset,release,sum(hits) AS hits + FROM checkins + GROUP BY week,dataset,release + ORDER BY week) + GROUP BY dataset,release) AS peaks + ON peaks.week = checkins.week AND peaks.release = checkins.release + """ + + at_peak = pd.read_sql_query(query, parse_dates='week', con=database) + pd.set_option('display.max_rows', len(at_peak)) + print(at_peak) if __name__ == "__main__": diff --git a/run.sh b/run.sh index 71512cf..41cfa7d 100755 --- a/run.sh +++ b/run.sh @@ -62,6 +62,14 @@ echo -n "* Scrubbing off the dirt... " fi echo " shiny!" +echo -n "* Finding the strongest... " + ./brontosaurus-fight.sh + if [[ $? != 0 ]]; then + echo "! Oops." + exit 1 + fi +echo " rarrhhhhr!" + echo -n "* Sorting the eggs... " ./brontosaurus-egg-sorter.py if [[ $? != 0 ]]; then From c3865b17a3d6c30e145b0da5fc7ee1dfd5d515dd Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jan 28 2022 00:10:07 +0000 Subject: [PATCH 3/49] bar charts in progras --- diff --git a/TODO.md b/TODO.md index a258cef..018c7f8 100644 --- a/TODO.md +++ b/TODO.md @@ -17,6 +17,8 @@ * for the slicer, put the groups in their definitions in the config.toml +* better ordering + * secondary timeline charts for variants: * variant variants! @@ -53,7 +55,7 @@ * age [arch,variant,release] (sort age 4-0 instead of 0-4) * variant for arch (different shape for ephemeral) -* EPEL charts with names in labels need fewer columns! + * EPEL charts with names in labels need fewer columns! * make animations by week of full [arch,variant,release] * maybe of the breakouts too? @@ -109,7 +111,7 @@ * fix the code in brotosaurus washer to merge '' to 'none' rather than just renaming (works now because there are no natural 'none' entries). 
- + * map "unknown" to "generic" * instead of throwing away entries in the washing phase (especially those diff --git a/brontosaurus-fight.sh b/brontosaurus-fight.sh index ba14da8..5726335 100755 --- a/brontosaurus-fight.sh +++ b/brontosaurus-fight.sh @@ -22,5 +22,7 @@ sqlite3 db/bronto.db << EOF GROUP BY week,dataset,release ORDER BY week) GROUP BY dataset,release) AS peaks - ON peaks.week = checkins.week AND peaks.release = checkins.release; + ON peaks.week = checkins.week + AND peaks.dataset = checkins.dataset + AND peaks.release = checkins.release; EOF diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index 5eb4401..5248b6c 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -147,6 +147,106 @@ def graph_timeseries(config, colormappings, params, dataframe): print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") +def graph_releasebars(config, colormappings, params, dataframe): + """Draws earch release in the set as a bar chart""" + + # If we find we have missing data, in the future: + # dataframe.resample('W-MON') + + dataset = params['dataset'] + dataseries = params['dataseries'] + + ################# + # Instead of this, accumulate anything more than 10 into "other" + # ... and do it _elsewhere_ (easier to do before pivot anyway!) + # + limit number of columns to 10 + other + + hidelist = dataframe.div(dataframe.sum( + axis=1), axis=0).max() < 0.2/100 + dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) + + ################## + # our colors. + # the complication here is keeping the same color for the same label + # across multiple graphs! + cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], + dataset, dataseries, list(dataframe.columns))) + + ################## + # and now.... graph it! 
+ + # FIXME: this is ugly + startdate = config['startdate'][dataset.split('_', 1)[0]] + + for view in params['views']: + + match view: + case 'stacked': + df = dataframe[startdate:] + kind = 'bar' + colormap = cmap + case 'share': + df = dataframe[startdate:].div( + dataframe.sum(axis=1), axis=0)*100 + kind = 'bar' + colormap = cmap + + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind) + + # Labels and titles and stuff. + ax = plt.gca() + + handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) + + if view == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] + + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) + + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': view, + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][view]} + + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) + + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) + + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') + + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") + + plt.close(graph.figure) + 
print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") + + ########################################### @@ -156,7 +256,7 @@ def main(): colormappings = defaultdict(OrderedDict) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - # cursor = database.cursor() + ''' for timeseries in config['timeseries']: params = config['timeseries_defaults'].copy() @@ -180,23 +280,34 @@ def main(): ) ''' - # sorry about this. - # what it does is: find all the rows from the peak - # week for each release. - query = """SELECT checkins.week,checkins.dataset,checkins.release,checkins.variant,checkins.arch,checkins.age,checkins.hits from checkins INNER JOIN - (SELECT week,dataset,release,max(hits) - FROM (SELECT week,dataset,release,sum(hits) AS hits - FROM checkins - GROUP BY week,dataset,release - ORDER BY week) - GROUP BY dataset,release) AS peaks - ON peaks.week = checkins.week AND peaks.release = checkins.release - """ - - at_peak = pd.read_sql_query(query, parse_dates='week', con=database) - pd.set_option('display.max_rows', len(at_peak)) - print(at_peak) + for byrelease in config['byrelease']: + params = config['byrelease_defaults'].copy() + params.update(byrelease) + + query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits + FROM peak + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY release, {params['dataseries']} + ORDER BY release + """ + df = pd.read_sql_query(query, con=database) + + graph_releasebars( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='release', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + +''' +### getting ahead of myself: this is for the waffle charts +query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" +df = pd.read_sql_query(query, parse_dates='week', con=database) +df +''' if __name__ == "__main__": main() diff --git a/config.toml 
b/config.toml index a6c5c58..4bcf55e 100644 --- a/config.toml +++ b/config.toml @@ -87,351 +87,420 @@ views=['line','stacked','share'] extraselect="AND age=0" filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_systems" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_systems" +# dataseries="age" +# views=['share','stacked'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# dataset="fedora_updates_systems" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_systems" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_systems" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_systems" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_systems" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_systems" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND 
age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_containers" +# dataseries="release" +# views=['line','stacked','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_containers" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_containers" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_containers" +# dataseries="age" +# views=['share','stacked'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# dataset="fedora_updates_containers" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_containers" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_containers" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_updates_containers" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_updates_containers" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_updates_containers" +# 
dataseries="variant" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +# [[timeseries]] +# dataset="fedora_rawhide_systems" +# dataseries="release" +# views=['line','stacked','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_systems" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_systems" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_rawhide_systems" +# dataseries="age" +# views=['share','stacked'] + +# [[timeseries]] +# dataset="fedora_rawhide_systems" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_systems" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_systems" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_rawhide_systems" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_systems" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_systems" +# 
dataseries="variant" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +# [[timeseries]] +# dataset="fedora_rawhide_containers" +# dataseries="release" +# views=['line','stacked','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_containers" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_containers" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_rawhide_containers" +# dataseries="age" +# views=['share','stacked'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# dataset="fedora_rawhide_containers" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_containers" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="fedora_rawhide_containers" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="fedora_rawhide_containers" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="fedora_rawhide_containers" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# 
subtitle="persistent systems" +# dataset="fedora_rawhide_containers" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +# [[timeseries]] +# dataset="epel" +# dataseries="release" +# views=['line','stacked','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="epel" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="epel" +# dataseries="release" +# views=['line','stacked','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="epel" +# dataseries="age" +# views=['share','stacked'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# dataset="epel" +# dataseries="arch" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="epel" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="epel" +# dataseries="arch" +# views=['line','share'] +# extraselect="AND age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +# [[timeseries]] +# dataset="epel" +# dataseries="variant" +# views=['line','share'] +# filebase="$dataset-timeseries-$dataseries-$view-all" + +# [[timeseries]] +# subtitle="ephemeral systems" +# dataset="epel" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND age=0" +# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +# [[timeseries]] +# subtitle="persistent systems" +# dataset="epel" +# dataseries="variant" +# views=['line','share'] +# extraselect="AND 
age>0" +# filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[byrelease_defaults] +title="$dataset_label: $dataseries_label by release" +filebase="$dataset-byrelease-$dataseries-$view" +extraselect="" +views=['stacked','share'] -[[timeseries]] +[[byrelease]] dataset="fedora_updates_systems" dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_updates_systems" dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" -[[timeseries]] +[[byrelease]] dataset="fedora_updates_systems" dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] 
-subtitle="persistent systems" -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] +[[byrelease]] dataset="fedora_updates_containers" dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_updates_containers" dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_updates_containers" dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" 
-dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] +[[byrelease]] dataset="fedora_rawhide_systems" dataseries="age" -views=['share','stacked'] - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_rawhide_systems" dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_rawhide_systems" dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] 
-extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] +[[byrelease]] dataset="fedora_rawhide_containers" dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] +[[byrelease]] dataset="fedora_rawhide_containers" dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" +[[byrelease]] dataset="fedora_rawhide_containers" dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -[[timeseries]] -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - 
-[[timeseries]] +[[byrelease]] dataset="epel" dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" -[[timeseries]] -dataset="epel" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" +[[byrelease]] dataset="epel" dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] +[[byrelease]] dataset="epel" dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - From c669134faa31a88244db6891c98edf409cb7c387 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Feb 04 2022 13:44:33 +0000 Subject: [PATCH 4/49] reenable timeseries views. still wip here... 
--- diff --git a/config.toml b/config.toml index 4bcf55e..5bdcb80 100644 --- a/config.toml +++ b/config.toml @@ -87,353 +87,353 @@ views=['line','stacked','share'] extraselect="AND age=0" filebase="$dataset-timeseries-$dataseries-$view-ephemeral" -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_systems" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_systems" -# dataseries="age" -# views=['share','stacked'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# dataset="fedora_updates_systems" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_systems" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_systems" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_systems" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_systems" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_systems" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_containers" -# dataseries="release" -# views=['line','stacked','share'] -# 
filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_containers" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_containers" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_containers" -# dataseries="age" -# views=['share','stacked'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# dataset="fedora_updates_containers" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_containers" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_containers" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_updates_containers" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_updates_containers" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_updates_containers" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -# [[timeseries]] -# 
dataset="fedora_rawhide_systems" -# dataseries="release" -# views=['line','stacked','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_systems" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_systems" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_rawhide_systems" -# dataseries="age" -# views=['share','stacked'] - -# [[timeseries]] -# dataset="fedora_rawhide_systems" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_systems" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_systems" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_rawhide_systems" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_systems" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_systems" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -# [[timeseries]] -# 
dataset="fedora_rawhide_containers" -# dataseries="release" -# views=['line','stacked','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_containers" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_containers" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_rawhide_containers" -# dataseries="age" -# views=['share','stacked'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# dataset="fedora_rawhide_containers" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_containers" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_containers" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="fedora_rawhide_containers" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="fedora_rawhide_containers" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="fedora_rawhide_containers" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# 
filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -# [[timeseries]] -# dataset="epel" -# dataseries="release" -# views=['line','stacked','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="epel" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="epel" -# dataseries="release" -# views=['line','stacked','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="epel" -# dataseries="age" -# views=['share','stacked'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# dataset="epel" -# dataseries="arch" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="epel" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="epel" -# dataseries="arch" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" - -# [[timeseries]] -# dataset="epel" -# dataseries="variant" -# views=['line','share'] -# filebase="$dataset-timeseries-$dataseries-$view-all" - -# [[timeseries]] -# subtitle="ephemeral systems" -# dataset="epel" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age=0" -# filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -# [[timeseries]] -# subtitle="persistent systems" -# dataset="epel" -# dataseries="variant" -# views=['line','share'] -# extraselect="AND age>0" -# filebase="$dataset-timeseries-$dataseries-$view-persistent" +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" 
+dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND 
age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_updates_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" 
+filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="age" +views=['share','stacked'] + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_systems" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_containers" 
+dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="fedora_rawhide_containers" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + + +[[timeseries]] +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="release" +views=['line','stacked','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="epel" +dataseries="age" +views=['share','stacked'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +dataset="epel" 
+dataseries="arch" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="arch" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="arch" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" + +[[timeseries]] +dataset="epel" +dataseries="variant" +views=['line','share'] +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[timeseries]] +subtitle="ephemeral systems" +dataset="epel" +dataseries="variant" +views=['line','share'] +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[timeseries]] +subtitle="persistent systems" +dataset="epel" +dataseries="variant" +views=['line','share'] +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" [byrelease_defaults] title="$dataset_label: $dataseries_label by release" From 16b06b5efd7e815003ef92025b0318789a525f3b Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Feb 04 2022 16:18:05 +0000 Subject: [PATCH 5/49] "actually" --- diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index 5248b6c..060a83c 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -257,7 +257,6 @@ def main(): database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - ''' for timeseries in config['timeseries']: params = config['timeseries_defaults'].copy() params.update(timeseries) @@ -278,7 +277,6 @@ def main(): dataframe=df.pivot(index='week', columns=params['dataseries'], values='hits').astype("Int64"), ) - ''' for byrelease in config['byrelease']: params = config['byrelease_defaults'].copy() From c6874ad1d155c92e9c7ba3c987e3c83a927e0671 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Feb 04 2022 17:19:52 +0000 Subject: [PATCH 6/49] subtitle 
note --- diff --git a/config.toml b/config.toml index 5bdcb80..cf42e07 100644 --- a/config.toml +++ b/config.toml @@ -437,6 +437,7 @@ filebase="$dataset-timeseries-$dataseries-$view-persistent" [byrelease_defaults] title="$dataset_label: $dataseries_label by release" +subtitle="data for each release taken from the week of that release's peak" filebase="$dataset-byrelease-$dataseries-$view" extraselect="" views=['stacked','share'] From b6e465eb034dd71a30d19dd8a024d11b009bda17 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Mar 03 2022 14:58:25 +0000 Subject: [PATCH 7/49] note on epel 8 graphs --- diff --git a/TODO.md b/TODO.md index 018c7f8..4135926 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1,6 @@ +* epel -- need to special-case EL 8 by-release graphs to add peak _after_ + CentOS Linux 8 EOL + * text reports!!! * this week / last week / year-over-year * total systems / total persistent / total ephemeral (+%) From 8f5e0dcbacade800b6fc249212d732adcd76ea04 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: May 26 2022 14:26:46 +0000 Subject: [PATCH 8/49] change start date for Fedora stats to January 2021 because apparent "increase" before that is just DNF Countme enablement ramp up --- diff --git a/TODO.md b/TODO.md index 4135926..48e8f16 100644 --- a/TODO.md +++ b/TODO.md @@ -87,12 +87,6 @@ * skip waffle charts that will never be interesting -* once we have more than a year of data, start Fedora chart at 2021-01-01, - same as epel, because that initial growth curve is not really - representative of anything but upgrades and all the initial data - therefore skewed - - * Change ./run.sh into a makefile, because old-school. * Can we get anything interesting for "which variants tend to persist after EOL"? 
diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index 060a83c..f569540 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -177,6 +177,7 @@ def graph_releasebars(config, colormappings, params, dataframe): # FIXME: this is ugly startdate = config['startdate'][dataset.split('_', 1)[0]] + startrelease = config['startrelease'][dataset.split('_', 1)[0]] for view in params['views']: diff --git a/brontosaurus-washer.sh b/brontosaurus-washer.sh index 8dad9ef..b655cb7 100755 --- a/brontosaurus-washer.sh +++ b/brontosaurus-washer.sh @@ -31,11 +31,9 @@ EOF # While some test systems ran Fedora Linux 31, the feature landed -# in 32 (released 2020-04-27, so drop all the old stuff. Note -# that the Plan is to change this to the same as STARTDAY, and -# actually backfill even for F32 with velociraptorizer data. +# in 32 (released 2020-04-27, so drop all the old stuff. FEDORA_STARTVER=32 -FEDORA_STARTDAY='2020-04-27' +FEDORA_STARTDAY='2021-01-01' # And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) EPEL_STARTVER=8 EPEL_STARTDAY='2021-01-01' diff --git a/config.toml b/config.toml index cf42e07..cf4308a 100644 --- a/config.toml +++ b/config.toml @@ -30,9 +30,8 @@ colors = [ # TODO: not yet implemented image_types = ["png"] -# todo: change the fedora start date to also 2021-01-01, after DevConf.cz 2022. [startdate] -fedora = '2020-04-27' # week of Fedora 32 release +fedora = '2021-01-01' # F32 release not fully captured, so start here. epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 [dataset_labels] From daa17bcd3f6d8096e6eb7566450795b227f6660b Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: May 31 2022 15:10:31 +0000 Subject: [PATCH 9/49] todo note :) --- diff --git a/TODO.md b/TODO.md index 48e8f16..24222e4 100644 --- a/TODO.md +++ b/TODO.md @@ -34,6 +34,8 @@ * arch variants: * desktop,server +* Report estimating new installs vs upgrades (number of systems older than + the release itself ... 
need to factor in beta releaes date, etc....) * I guess we should make it so the timeseries definitions can loop over multiple datasets to avoid a lot of redundancy. Or at least, to apply to all Fedora datasets? (Yes, that: introduce a "distro" grouping.) From bfff4fe72d066be191b2365dd2c4d286e3b7e57a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 19 2022 16:11:32 +0000 Subject: [PATCH 10/49] stacked bar graphs work now --- diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index f569540..e048f62 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -148,7 +148,7 @@ def graph_timeseries(config, colormappings, params, dataframe): def graph_releasebars(config, colormappings, params, dataframe): - """Draws earch release in the set as a bar chart""" + """Draws each release in the set as a bar chart""" # If we find we have missing data, in the future: # dataframe.resample('W-MON') @@ -177,7 +177,6 @@ def graph_releasebars(config, colormappings, params, dataframe): # FIXME: this is ugly startdate = config['startdate'][dataset.split('_', 1)[0]] - startrelease = config['startrelease'][dataset.split('_', 1)[0]] for view in params['views']: @@ -192,9 +191,9 @@ def graph_releasebars(config, colormappings, params, dataframe): kind = 'bar' colormap = cmap - # Start the actual graph + # Start the actual graph graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind) + colormap=colormap, kind=kind, stacked=True) # Labels and titles and stuff. 
ax = plt.gca() @@ -258,47 +257,49 @@ def main(): database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - for timeseries in config['timeseries']: - params = config['timeseries_defaults'].copy() - params.update(timeseries) - - query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits - FROM checkins - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY week, {params['dataseries']} - ORDER BY week - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - graph_timeseries( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - for byrelease in config['byrelease']: - params = config['byrelease_defaults'].copy() - params.update(byrelease) - - query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits - FROM peak - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY release, {params['dataseries']} - ORDER BY release - """ - df = pd.read_sql_query(query, con=database) - - graph_releasebars( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='release', columns=params['dataseries'], - values='hits').astype("Int64"), - ) + if 'timeseries' in config: + for timeseries in config['timeseries']: + params = config['timeseries_defaults'].copy() + params.update(timeseries) + + query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits + FROM checkins + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY week, {params['dataseries']} + ORDER BY week + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + graph_timeseries( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + + if 'byrelease' in config: + for byrelease in config['byrelease']: + params = 
config['byrelease_defaults'].copy() + params.update(byrelease) + + query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits + FROM peak + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY release, {params['dataseries']} + ORDER BY release + """ + df = pd.read_sql_query(query, con=database) + + graph_releasebars( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='release', columns=params['dataseries'], + values='hits').astype("Int64"), + ) ''' diff --git a/config.toml b/config.toml index cf4308a..1da7e54 100644 --- a/config.toml +++ b/config.toml @@ -436,7 +436,7 @@ filebase="$dataset-timeseries-$dataseries-$view-persistent" [byrelease_defaults] title="$dataset_label: $dataseries_label by release" -subtitle="data for each release taken from the week of that release's peak" +subtitle="data for each release taken from the week of that release's (current) peak" filebase="$dataset-byrelease-$dataseries-$view" extraselect="" views=['stacked','share'] @@ -448,11 +448,35 @@ dataseries="age" [[byrelease]] dataset="fedora_updates_systems" dataseries="arch" +filebase="$dataset-timeseries-$dataseries-$view-all" + +[[byrelease]] +dataset="fedora_updates_systems" +dataseries="arch" +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[byrelease]] +dataset="fedora_updates_systems" +dataseries="arch" +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" [[byrelease]] dataset="fedora_updates_systems" dataseries="variant" +[[byrelease]] +dataset="fedora_updates_systems" +dataseries="variant" +extraselect="AND age=0" +filebase="$dataset-timeseries-$dataseries-$view-ephemeral" + +[[byrelease]] +dataset="fedora_updates_systems" +dataseries="variant" +extraselect="AND age>0" +filebase="$dataset-timeseries-$dataseries-$view-persistent" [[byrelease]] dataset="fedora_updates_containers" @@ -501,6 +525,7 @@ dataseries="age" dataset="epel" 
dataseries="arch" + [[byrelease]] dataset="epel" dataseries="variant" From bb3331f97903bd226f6e068e68674f2d93cce322 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 19 2022 16:21:30 +0000 Subject: [PATCH 11/49] generic --- diff --git a/TODO.md b/TODO.md index 24222e4..72eed3a 100644 --- a/TODO.md +++ b/TODO.md @@ -111,7 +111,7 @@ * fix the code in brotosaurus washer to merge '' to 'none' rather than just renaming (works now because there are no natural 'none' entries). -* map "unknown" to "generic" +* map "generic" and "unknown" and "none" to "unspecified" * instead of throwing away entries in the washing phase (especially those below thresholds), write them to a special db for "fun" analysis From 406764a117e1b37ffb5da030756f7c90e0eea658 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 19 2022 20:47:04 +0000 Subject: [PATCH 12/49] NEW PLAN --- diff --git a/TODO.md b/TODO.md index 72eed3a..6a891c4 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1,16 @@ +* new plan! + + 1. Save the color mappings to a file as a separate step + * using defaults from config + 2. change brontosaurus-plotter to render _one_ image per call + * with a syntax for what to include or exclude by name + * and possibly with some number options? + 3. make timeline, releasebar, and waffle be separate commands + 4. have some script that pre-renders some defaults + 5. and a simple front-end for exploring the rest + +* put the dataset date in the filename! 
+ * epel -- need to special-case EL 8 by-release graphs to add peak _after_ CentOS Linux 8 EOL From a76eacd736b445f5af9492df98e7fefd4288d405 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 22 2022 15:23:16 +0000 Subject: [PATCH 13/49] oops filenames --- diff --git a/config.toml b/config.toml index 1da7e54..fb53367 100644 --- a/config.toml +++ b/config.toml @@ -448,19 +448,19 @@ dataseries="age" [[byrelease]] dataset="fedora_updates_systems" dataseries="arch" -filebase="$dataset-timeseries-$dataseries-$view-all" +filebase="$dataset-byrelease-$dataseries-$view-all" [[byrelease]] dataset="fedora_updates_systems" dataseries="arch" extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" +filebase="$dataset-byrelease-$dataseries-$view-ephemeral" [[byrelease]] dataset="fedora_updates_systems" dataseries="arch" extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" +filebase="$dataset-byrelease-$dataseries-$view-persistent" [[byrelease]] dataset="fedora_updates_systems" @@ -470,13 +470,13 @@ dataseries="variant" dataset="fedora_updates_systems" dataseries="variant" extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" +filebase="$dataset-byrelease-$dataseries-$view-ephemeral" [[byrelease]] dataset="fedora_updates_systems" dataseries="variant" extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" +filebase="$dataset-byrelease-$dataseries-$view-persistent" [[byrelease]] dataset="fedora_updates_containers" From 3bbff3d4ab55faf74e6bdc0f9fbf2a96f13b152a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 21:53:20 +0000 Subject: [PATCH 14/49] separate out color caching --- diff --git a/.gitignore b/.gitignore index 50d0c76..c2165cb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ **/.~lock*# db/* images/* -.ipynb_checkpoints/* \ No newline at end of file +.ipynb_checkpoints/* +__pycache__ \ No newline at end of file diff --git 
a/TODO.md b/TODO.md index 6a891c4..533cc45 100644 --- a/TODO.md +++ b/TODO.md @@ -1,7 +1,7 @@ * new plan! 1. Save the color mappings to a file as a separate step - * using defaults from config + * using defaults from config (DONE) 2. change brontosaurus-plotter to render _one_ image per call * with a syntax for what to include or exclude by name * and possibly with some number options? diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py index e048f62..29ca465 100755 --- a/brontosaurus-plotter.py +++ b/brontosaurus-plotter.py @@ -6,9 +6,6 @@ import matplotlib.pyplot as plt import sqlite3 from string import Template -from collections import defaultdict -from collections import OrderedDict - from pprint import pprint import pandas as pd @@ -16,6 +13,8 @@ import toml import matplotlib as m +from brontosaurus_colorizer import load_color_cache, get_colors + DATAFILE = 'db/bronto.db' m.use("Agg") @@ -26,21 +25,6 @@ m.rcParams['font.family'] = 'Montserrat' m.rcParams['legend.frameon'] = False -def get_colors(colormappings, colorlist, dataset, dataseries, items): - """This makes colors 'sticky' for the whole run.""" - - key = dataset + '_' + dataseries - - # for each label item, assign the next color in the colorlist - # and save that for later. 
- outcolors = [] - for item in items: - if item not in colormappings[key]: - colormappings[key][item] = colorlist[len( - colormappings[key]) % len(colorlist)] - outcolors.append(colormappings[key][item]) - - return outcolors def graph_timeseries(config, colormappings, params, dataframe): @@ -253,7 +237,9 @@ def graph_releasebars(config, colormappings, params, dataframe): def main(): config = toml.load("config.toml") - colormappings = defaultdict(OrderedDict) + + + colormappings = load_color_cache(config['color_cache'],config['color_presets']) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) diff --git a/brontosaurus_colorizer.py b/brontosaurus_colorizer.py new file mode 100755 index 0000000..878f4dc --- /dev/null +++ b/brontosaurus_colorizer.py @@ -0,0 +1,114 @@ +#!/usr/bin/python3 +""" +This script goes through the database and pre-sets colors for each combination. + +This is saved to +""" + +from pprint import pprint +import sqlite3 + +from collections import defaultdict +from collections import OrderedDict + + +import toml +import re + + + + +def get_colors(colormappings, colorlist, dataset, dataseries, items): + """This makes colors 'sticky' for the whole run.""" + + key = dataset + '.' + dataseries + + # for each label item, assign the next color in the colorlist + # and save that for later. 
+ outcolors = [] + for item in items: + if item not in colormappings[key]: + colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] + outcolors.append(colormappings[key][str(item)]) + + return outcolors + + +def load_color_cache(cachefile,presetfile): + + colormappings = defaultdict(OrderedDict) + try: + cached = toml.load(cachefile) + except FileNotFoundError: + print(f"Can't open color cache {cachefile}, so starting fresh.") + cached = {} + + try: + presets = toml.load(presetfile) + cached.update(presets) + except FileNotFoundError: + print(f"No color preset file {presetfile} found.") + + + # gotta do this because we want a defaultdict but + # toml load just gives us a regular dict. + for key in cached.keys(): + colormappings[key] = cached[key].copy() + + return colormappings + + +def main(): + + config = toml.load("config.toml") + + colormappings = load_color_cache(config['color_cache'],config['color_presets']) + + database = sqlite3.connect(config['datafile'], detect_types=sqlite3.PARSE_DECLTYPES) + cur = database.cursor() + + + cur.execute("SELECT dataset FROM checkins GROUP BY dataset") + datasets = [t[0] for t in cur.fetchall()] + cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() + dataserieses = [t[0] for t in cur.description] + dataserieses.remove('week') + dataserieses.remove('dataset') + dataserieses.remove('hits') + + + + for dataset in datasets: + if not re.match('^[0-9a-z_]*$', dataset): + print(f"Bad dataset name! '%{dataset}") + exit(1) + for dataseries in dataserieses: + if not re.match('^[0-9a-z_]*$', dataseries): + print(f"Bad dataseries name! 
'%{dataseries}") + exit(1) + + if dataseries == 'age': + order="" + elif dataseries == 'release': + order="ORDER BY release DESC" + else: + order='ORDER BY total DESC' + query = f"""SELECT {dataseries},sum(hits) AS TOTAL + FROM checkins + WHERE dataset = '{dataset}' + GROUP BY {dataseries} + {order} + """ + cur.execute(query) + items = [t[0] for t in cur.fetchall()] + #print(dataset,dataseries,items) + + get_colors(colormappings, config['colors'], dataset, dataseries, items) + + + with open(config['color_cache'], "w") as toml_file: + toml.dump(colormappings, toml_file) + + +if __name__ == "__main__": + main() diff --git a/color-presets.toml b/color-presets.toml new file mode 100644 index 0000000..dd54edb --- /dev/null +++ b/color-presets.toml @@ -0,0 +1,8 @@ +["epel.variant"] +"CentOS Linux" = "#101010" +"Red Hat Enterprise Linux" = "#ee0000" +"CentOS Stream" = "#a14a8c" +"Rocky Linux" = "#10b981" +"AlmaLinux" = "#ffcc0a" +"Oracle Linux Server" = "#aaaaaa" +"CloudLinux" = "#0097f3" diff --git a/config.toml b/config.toml index fb53367..01163b4 100644 --- a/config.toml +++ b/config.toml @@ -1,3 +1,8 @@ +datafile = "db/bronto.db" + +color_presets = "color-presets.toml" +color_cache = "db/color-cache.toml" + ephemeral = "all" figsize = [16, 9] @@ -24,6 +29,7 @@ colors = [ '#aad0ee', '#101010', '#535961', + '#808080', ] # could be png, pdf, svg diff --git a/run.sh b/run.sh index 41cfa7d..09b84b5 100755 --- a/run.sh +++ b/run.sh @@ -84,6 +84,11 @@ echo "* Creating cages for different exhibits..." done echo " Built!" +echo "* Painting the feathers..." + rm db/color-cache.toml 2> /dev/null + ./brontosaurus-colorizer.py +echo " Vibrant!" + echo "* Drawing portraits from the fossilized remains... 
" LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) ./brontosaurus-plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null From d1c81e9c8ae9178b789224edefd0d082a314ecbd Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 21:55:01 +0000 Subject: [PATCH 15/49] underscore instead of hyphen --- diff --git a/brontosaurus-egg-sorter.py b/brontosaurus-egg-sorter.py deleted file mode 100755 index e8dff2e..0000000 --- a/brontosaurus-egg-sorter.py +++ /dev/null @@ -1,277 +0,0 @@ -#!/usr/bin/python3 - -# Written by Matthew Miller - -# This estimates the number of systems of age 1 which turn into long-running -# systems and which are just ephemeral test installs, ci system instances, -# builders, etc., and relabels the latter as "age=0". -# -# The basic concept is: a system reporting as "one" could either be -# persistent (that is, goes into "two" next week) or ephemeral (doesn't). -# Any increase in group two [across versions] _must_ be from systems coming -# from group one. Therefore, at least that number of systems from "one" must -# be "persistent". However, that can undercount if systems from group two go -# offline — it may be that there are more persistent systems than we -# thought. -# -# We can mitigate this a little bit by also considering the flow to groups -# three and four, working backwards, like this: -# -# one,two,three,four = this weeks age values -# one1,two1,three1,four1 = next week's age values -# -# # If four1-four is negative, older more systems are going offline then -# # aging into it. Increases must come from group three, or ghosts -# # (ignored; see below). There may be _more_ from three to four, propping -# # up the value as other systems go offline, but it can't be more than -# # the total increase -# min_flow_to_four = max(four1-four,0) # time only goes forward -# max_flow_to_four = min(three,four1) # could be _total turnover_ -# -# # Without any flow from two, three would go down by the range above. 
-# # How much _did_ we go down by (if any?) Anything above that must -# # be flow from two (or ghosts!) -# min_flow_to_three = max(three1-three,0) + min_flow_to_four -# # max is what's there in the next week _plus_ what could have -# # moved on. -# max_flow_to_three = min(two,three1 + max_flow_to_four) -# -# # Same deal, but one cohort over... -# min_flow_to_two = max(two1-two,0) + min_flow_to_three -# max_flow_to_two = min(one,two1 + max_flow_to_three) -# -# # Leaving us with ... -# -# min_ephemeral (zero) = one - max_flow_to_two -# min_persistent (one) = min(one,min_flow_to_two) -# -# max_ephemeral (zero) = one - min(one,min_flow_to_two) -# max_persistent (one) = max_flow_to_two -# -# # split? -# -# or -# -# moved_up=min(one,max(two1-two,0) + max(three1-three,0) + max(four1-four,0)) -# zero=one-moved_up -# one=moved_up -# -# *or* -# -# moved_up=min(one,two1 + min(two,three1 + min(three,four1))) -# zero=one-moved_up -# one=moved_up -# -# Of course, this assumes that random new systems won't show up in -# later groups — ghosts! Right now, I'm assuming they're rare enough -# to ignore for the purpose of this estimation. Theoretically, ghost -# systems mean that the minimum estimate is actually too high. -# -# Also: since age 1 is 1 week, all systems must move up (or vanish). -# Age 2 is 3 weeks (weeks 2,3,4), so assuming that the average number -# of new permanant installations is roughly smooth week to week (no -# huge jumps) on some days, that means we can assume turnover of 1/3 for -# that group. Age 3 is 20 weeks (weeks 5-24, inclusive), so turnover should -# be much smaller — more like 1/20! -# -# Also. this it keeps track of the percentage of a given system type that is -# ephemeral, and uses that to guess for missing values (like, the last week -# in the dataset, where there is no "next week" yet.) -# -# To consider: bias towards upgrades when a new release is just out? or does -# that get us too much seeing what we want to see? 
We could deterimine this -# by the first week systems seen breaks some threshold, or more cleverly -# by noticing when the curve jumps. -# - -import os -import sys -import string -import sqlite3 -import datetime -from collections import Counter -from tokenize import group - -DATAFILE = 'db/bronto.db' - -DATABASE = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - -loopcursor = DATABASE.cursor() -nextcursor = DATABASE.cursor() - -zerocounter = Counter() -onecounter = Counter() - - -loopcursor.execute( - "SELECT DISTINCT(dataset) FROM checkins ORDER BY dataset DESC") -groups = [item for sublist in loopcursor.fetchall() for item in sublist] - - -for group in groups: - zerocounter.clear() - onecounter.clear() - - loopcursor.execute( - 'SELECT * FROM checkins WHERE dataset = :dataset AND age = 1 ORDER BY week', {'dataset': group}) - for row in loopcursor: - (week, group, release, variant, arch, age, hits) = row - - thisone = hits - - # get the other age groups for this type of system, if any - query = """SELECT age,hits FROM checkins WHERE - week = :week AND - dataset = :dataset AND - release = :release AND - variant = :variant AND - arch = :arch AND - age > 1 - ORDER BY age - """ - nextcursor.execute(query, - {"week": week, - "dataset": group, - "release": release, - "variant": variant, - "arch": arch - }) - thisweek = nextcursor.fetchall() - twothis = 0 - threethis = 0 - fourthis = 0 - if thisweek: - for agegroup in thisweek: - (age, hits) = agegroup - if age == 2: - twothis = hits - elif age == 3: - threethis = hits - elif age == 4: - fourthis = hits - else: - raise ValueError() - - # Get the age groups for next week (if any) for this type of system - # For Fedora Linux, we're also including higher release numbers -- - # systems could be upgraded! However, that's unlikely for epel. So.. 
- if group.split('_', 1)[0] == 'fedora': - query = """SELECT age,sum(hits) FROM checkins WHERE - week = :nextweek AND - dataset = :dataset AND - release >= :release AND - variant = :variant AND - arch = :arch AND - age > 1 - GROUP BY age - ORDER BY age""" - else: - query = """SELECT age,sum(hits) FROM checkins WHERE - week = :nextweek AND - dataset = :dataset AND - release = :release AND - variant = :variant AND - arch = :arch AND - age > 1 - GROUP BY age - ORDER BY age""" - nextcursor.execute(query, - {"nextweek": (datetime.datetime.fromisoformat(week) + datetime.timedelta(weeks=1)).strftime("%Y-%m-%d"), # this was easier when it was weeknums! - "dataset": group, - "release": release, - "variant": variant, - "arch": arch - }) - nextweek = nextcursor.fetchall() - - if nextweek: - - twonext = 0 - threenext = 0 - fournext = 0 - - for agegroup in nextweek: - (age, hits) = agegroup - if age == 2: - twonext = hits - elif age == 3: - threenext = hits - elif age == 4: - fournext = hits - else: - raise ValueError(f"age is {age}") - - # okay, whew. see long comment at top of file for explanation of the theory. - # in practice, there are these three possibilities: - # moved_up_min = max(twonext-twothis, 0) + - # max(threenext - threethis, 0) + - # max(fournext-fourthis, 0) - # moved_up_max = twonext + min(twothis, threenext + min(threethis, fournext)) - # - # moved_up_timebased = int(twothis/3 + threethis/20 + [some estimate of dropout rate of fourthis]) - # - # So what I'm doing here is for each period, going with time-based capped by - # the min and max flow for that age group. And then for age 4, arbitrarily picking - # a ratio. Note that min and timebased separate easily, but the max is difficult - # - # For now, we're assuming _minimum_ flow from age 3 into age 4. - # Why? This makes the bands for age 1 and age 2 look most reasonable. 
- moved_to_four = max(fournext-fourthis, 0) - moved_to_three = min(max(threethis/20, max(threenext-threethis, 0)+moved_to_four), - min(twothis, threenext + moved_to_four)) - moved_to_two = min(max( - twothis/3, max(twonext-twothis, 0) + moved_to_three), min(thisone, twonext + moved_to_three)) - moved_up = min(thisone, int(moved_to_two)) - new_zero = thisone - moved_up - assert(new_zero >= 0) - new_one = moved_up - assert (new_one == thisone-new_zero) - - # keep a running total for the estimate - zerocounter[(release, variant, arch)] += new_zero - onecounter[(release, variant, arch)] += new_one - - else: # no values for next week, so... estimate! - totalprevious = zerocounter[( - release, variant, arch)] + onecounter[(release, variant, arch)] - if totalprevious: - new_zero = round( - thisone*zerocounter[(release, variant, arch)]/totalprevious) - new_one = thisone-new_zero - else: - # no estimate for this row, so assume all ephemeral - new_zero = thisone - new_one = 0 - - assert new_zero + \ - new_one == thisone, "{} + {} = {}".format( - new_zero, new_one, thisone) - - nextcursor.execute("""INSERT INTO checkins - (week, dataset, release, variant, arch, age, hits) - VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", - {"week": week, - "dataset": group, - "release": release, - "variant": variant, - "arch": arch, - "age": 0, - "hits": new_zero - }) - nextcursor.execute("""REPLACE INTO checkins - (week, dataset, release, variant, arch, age, hits) - VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", - {"week": week, - "dataset": group, - "release": release, - "variant": variant, - "arch": arch, - "age": 1, - "hits": new_one - }) - - # these are just clutter, and it's easier to zap them at the end - # than to avoid making them in the loop. 
- loopcursor.execute(f"""DELETE from checkins WHERE hits=0""") - - DATABASE.commit() diff --git a/brontosaurus-fight.sh b/brontosaurus-fight.sh deleted file mode 100755 index 5726335..0000000 --- a/brontosaurus-fight.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# -# Create a view which only shows the weeks where each release -# is at its peak. If someone actually is Good At SQL, I would -# not mind help making this more clear. - -sqlite3 db/bronto.db << EOF - DROP VIEW IF EXISTS peak; - CREATE VIEW peak AS - SELECT checkins.week, - checkins.dataset, - checkins.release, - checkins.variant, - checkins.arch, - checkins.age, - checkins.hits - FROM checkins - INNER JOIN - (SELECT week,dataset,release,max(hits) - FROM (SELECT week,dataset,release,sum(hits) AS hits - FROM checkins - GROUP BY week,dataset,release - ORDER BY week) - GROUP BY dataset,release) AS peaks - ON peaks.week = checkins.week - AND peaks.dataset = checkins.dataset - AND peaks.release = checkins.release; -EOF diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py deleted file mode 100755 index 29ca465..0000000 --- a/brontosaurus-plotter.py +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/python3 - -import matplotlib.dates as dates -import matplotlib.pyplot as plt - -import sqlite3 -from string import Template - -from pprint import pprint - -import pandas as pd -import toml - -import matplotlib as m - -from brontosaurus_colorizer import load_color_cache, get_colors - -DATAFILE = 'db/bronto.db' - -m.use("Agg") - -m.style.use('seaborn-colorblind') -m.rcParams['font.size'] = 12 -m.rcParams['font.family'] = 'Montserrat' -m.rcParams['legend.frameon'] = False - - - - -def graph_timeseries(config, colormappings, params, dataframe): - """Draws line or area chart for a dataseries over time.""" - - # If we find we have missing data, in the future: - # dataframe.resample('W-MON') - - dataset = params['dataset'] - dataseries = params['dataseries'] - - ################# - # Instead of this, accumulate anything 
more than 10 into "other" - # ... and do it _elsewhere_ (easier to do before pivot anyway!) - # + limit number of columns to 10 + other - - hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < 0.2/100 - dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) - - ################## - # our colors. - # the complication here is keeping the same color for the same label - # across multiple graphs! - cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], - dataset, dataseries, list(dataframe.columns))) - - ################## - # and now.... graph it! - - # FIXME: this is ugly - startdate = config['startdate'][dataset.split('_', 1)[0]] - - for view in params['views']: - - match view: - case 'line': - df = dataframe[startdate:] - kind = 'line' - colormap = cmap - case 'stacked': - df = dataframe[startdate:][dataframe.columns[::-1]] - kind = 'area' - colormap = m.colors.ListedColormap(cmap.colors[::-1]) - case 'share': - df = dataframe[startdate:].div( - dataframe.sum(axis=1), axis=0)*100 - kind = 'area' - colormap = cmap - - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind) - - # Labels and titles and stuff. 
- ax = plt.gca() - - handles, labels = ax.get_legend_handles_labels() - - # TODO: generalize this - if dataseries == 'age': - labels = list(map(config['age_labels'].get, labels)) - - if view == 'stacked': - handles[:] = handles[::-1] - labels[:] = labels[::-1] - - plt.legend(handles, labels, loc='center left', - bbox_to_anchor=(1.0, 0.5)) - - madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'view': view, - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][view]} - - if 'title' in params: - plt.suptitle(Template(params['title']).safe_substitute(madlibs), - fontsize=24) - - # FIX: make work - if 'subtitle' in params: - graph.set_title( - Template(params['subtitle']).safe_substitute(madlibs), - fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", - dpi=config['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") - - -def graph_releasebars(config, colormappings, params, dataframe): - """Draws each release in the set as a bar chart""" - - # If we find we have missing data, in the future: - # dataframe.resample('W-MON') - - dataset = params['dataset'] - dataseries = params['dataseries'] - - ################# - # Instead of this, accumulate anything more than 10 into "other" - # ... and do it _elsewhere_ (easier to do before pivot anyway!) 
- # + limit number of columns to 10 + other - - hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < 0.2/100 - dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) - - ################## - # our colors. - # the complication here is keeping the same color for the same label - # across multiple graphs! - cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], - dataset, dataseries, list(dataframe.columns))) - - ################## - # and now.... graph it! - - # FIXME: this is ugly - startdate = config['startdate'][dataset.split('_', 1)[0]] - - for view in params['views']: - - match view: - case 'stacked': - df = dataframe[startdate:] - kind = 'bar' - colormap = cmap - case 'share': - df = dataframe[startdate:].div( - dataframe.sum(axis=1), axis=0)*100 - kind = 'bar' - colormap = cmap - - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind, stacked=True) - - # Labels and titles and stuff. 
- ax = plt.gca() - - handles, labels = ax.get_legend_handles_labels() - - # TODO: generalize this - if dataseries == 'age': - labels = list(map(config['age_labels'].get, labels)) - - if view == 'stacked': - handles[:] = handles[::-1] - labels[:] = labels[::-1] - - plt.legend(handles, labels, loc='center left', - bbox_to_anchor=(1.0, 0.5)) - - madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'view': view, - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][view]} - - if 'title' in params: - plt.suptitle(Template(params['title']).safe_substitute(madlibs), - fontsize=24) - - # FIX: make work - if 'subtitle' in params: - graph.set_title( - Template(params['subtitle']).safe_substitute(madlibs), - fontsize=14) - - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", - dpi=config['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") - - -########################################### - - -def main(): - - config = toml.load("config.toml") - - - colormappings = load_color_cache(config['color_cache'],config['color_presets']) - - database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - - if 'timeseries' in config: - for timeseries in config['timeseries']: - params = config['timeseries_defaults'].copy() - params.update(timeseries) - - query = f"""SELECT 
week, {params['dataseries']}, SUM(hits) as hits - FROM checkins - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY week, {params['dataseries']} - ORDER BY week - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - graph_timeseries( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - if 'byrelease' in config: - for byrelease in config['byrelease']: - params = config['byrelease_defaults'].copy() - params.update(byrelease) - - query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits - FROM peak - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY release, {params['dataseries']} - ORDER BY release - """ - df = pd.read_sql_query(query, con=database) - - graph_releasebars( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='release', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - -''' -### getting ahead of myself: this is for the waffle charts -query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" -df = pd.read_sql_query(query, parse_dates='week', con=database) -df -''' - -if __name__ == "__main__": - main() diff --git a/brontosaurus-slicer.sh b/brontosaurus-slicer.sh deleted file mode 100755 index 8235990..0000000 --- a/brontosaurus-slicer.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash - -# This script implements the filter rules in NOTES.md, and splits -# the totals.db into a new cleaned-up table in a "bronto.db" file. -# -# It splits the records into major groups: EPEL, and then also -# "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. -# Because "group" is a reserved word in sql, we use "dataset". -# -# It removes the os_ prefix, because without repo_ columns there -# is no ambiguity to resolve. 
-# -# And, it converts weeknums (which start on January 5th, 1970, -# the first Monday of the epoch) to dates. -# -# We're dropping the os_version field from EL and instead just -# using the first digit (e.g. "8" or "9") as "release". We are -# also using _name_ as variant. We could keep these separate, -# but this way we have the same fields for both types. - - -sqlite3 db/totals.db << EOF - -ATTACH DATABASE 'db/bronto.db' AS bronto; - -DROP TABLE IF EXISTS bronto.checkins; - -CREATE TABLE bronto.checkins( - week INT, - dataset TEXT, - release TEXT, - variant TEXT, - arch TEXT, - age INT CHECK(age<5), - hits INT, - UNIQUE (week,dataset,release,variant,arch,age) -); - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_systems" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag LIKE 'updates-released-f__' - AND os_version = substr(repo_tag,-2,2) - AND os_arch = repo_arch - AND os_variant != 'container' - AND os_variant != 'toolbx' - AND os_variant != 'snappy' - GROUP BY week,release,variant,arch,age; - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_containers" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag LIKE 'updates-released-f__' - AND os_version = substr(repo_tag,-2,2) - AND os_arch = repo_arch - AND (os_variant = 'container' OR - os_variant = 'toolbx' OR - os_variant = 'snappy') - GROUP BY week,release,variant,arch,age; - - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_systems" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - 
SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag = 'fedora-rawhide' OR repo_tag = 'rawhide' - AND os_arch = repo_arch - AND os_variant != 'container' - AND os_variant != 'toolbx' - AND os_variant != 'snappy' - GROUP BY week,release,variant,arch,age; - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_containers" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag = 'rawhide' - AND os_arch = repo_arch - AND (os_variant = 'container' OR - os_variant = 'toolbx' OR - os_variant = 'snappy') - GROUP BY week,release,variant,arch,age; - - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "epel" AS dataset, - CASE instr(os_version,".") - WHEN 0 THEN os_version - ELSE substr(os_version,0,instr(os_version,".")) - END AS release, - os_name AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE repo_tag = 'epel-' || release - AND os_arch = repo_arch - GROUP BY week,release,variant,arch,age; - -DETACH bronto; - -EOF diff --git a/brontosaurus-washer.sh b/brontosaurus-washer.sh deleted file mode 100755 index b655cb7..0000000 --- a/brontosaurus-washer.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# -# For every table in bronto.db, delete "known bad" variants. -# - - -# Please document each new reason for cleaning something here, so we know -# why later. -# -# The variants 09975472-cc15-4020-9231-cc6743a15b0f and c1e9af0e-5816-4644-9c89-703aa1fdcf04 -# are strings I (mattdm) used for testing. 
-# -# The variant "CentOS Stream v21.*" is some sort of horrible scripted thing -# apparently, where it keeps getting longer and longer with additional tags -# -# Also, for each table, sets any variant that is '' to 'none', because -# '' is hard to work with. (I think this is when people have manually put -# "VARIANT_ID=", as opposed to not having one. I don't think that's useful -# to track separately from 'generic', really, so an alternative would be -# to merge them... but doing this for now.) -# FIXME: this needs to merge them in case someone starts actually sending -# "none" as the string — we'll get a uniqueness constraint violation. -# -# This is a regex, in case that's not clear. -GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" - -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; - UPDATE checkins SET variant='none' WHERE variant=''; -EOF - - -# While some test systems ran Fedora Linux 31, the feature landed -# in 32 (released 2020-04-27, so drop all the old stuff. -FEDORA_STARTVER=32 -FEDORA_STARTDAY='2021-01-01' -# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) -EPEL_STARTVER=8 -EPEL_STARTDAY='2021-01-01' -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; -EOF - -# Clean up entries for name, arch, or release that show up -# with less than some threshold in _total_ hits in the -# whole database, or where the weekly numbers never exceed some -# small value. This removes both small bursts of nonsense -# and also most long-lived singletons. We may want to revisit -# what these are set to when we have more data. 
-# -# Note that since we regenerate the whole db from totals.db -# each week, if something exceeds this threshold later, it will -# suddenly appear -THRESHOLD_TOTAL=100 -THRESHOLD_WEEKLY=3 - -for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do - sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); -EOF -done diff --git a/brontosaurus_egg-sorter.py b/brontosaurus_egg-sorter.py new file mode 100755 index 0000000..e8dff2e --- /dev/null +++ b/brontosaurus_egg-sorter.py @@ -0,0 +1,277 @@ +#!/usr/bin/python3 + +# Written by Matthew Miller + +# This estimates the number of systems of age 1 which turn into long-running +# systems and which are just ephemeral test installs, ci system instances, +# builders, etc., and relabels the latter as "age=0". +# +# The basic concept is: a system reporting as "one" could either be +# persistent (that is, goes into "two" next week) or ephemeral (doesn't). 
+# Any increase in group two [across versions] _must_ be from systems coming +# from group one. Therefore, at least that number of systems from "one" must +# be "persistent". However, that can undercount if systems from group two go +# offline — it may be that there are more persistent systems than we +# thought. +# +# We can mitigate this a little bit by also considering the flow to groups +# three and four, working backwards, like this: +# +# one,two,three,four = this weeks age values +# one1,two1,three1,four1 = next week's age values +# +# # If four1-four is negative, older more systems are going offline then +# # aging into it. Increases must come from group three, or ghosts +# # (ignored; see below). There may be _more_ from three to four, propping +# # up the value as other systems go offline, but it can't be more than +# # the total increase +# min_flow_to_four = max(four1-four,0) # time only goes forward +# max_flow_to_four = min(three,four1) # could be _total turnover_ +# +# # Without any flow from two, three would go down by the range above. +# # How much _did_ we go down by (if any?) Anything above that must +# # be flow from two (or ghosts!) +# min_flow_to_three = max(three1-three,0) + min_flow_to_four +# # max is what's there in the next week _plus_ what could have +# # moved on. +# max_flow_to_three = min(two,three1 + max_flow_to_four) +# +# # Same deal, but one cohort over... +# min_flow_to_two = max(two1-two,0) + min_flow_to_three +# max_flow_to_two = min(one,two1 + max_flow_to_three) +# +# # Leaving us with ... +# +# min_ephemeral (zero) = one - max_flow_to_two +# min_persistent (one) = min(one,min_flow_to_two) +# +# max_ephemeral (zero) = one - min(one,min_flow_to_two) +# max_persistent (one) = max_flow_to_two +# +# # split? 
+# +# or +# +# moved_up=min(one,max(two1-two,0) + max(three1-three,0) + max(four1-four,0)) +# zero=one-moved_up +# one=moved_up +# +# *or* +# +# moved_up=min(one,two1 + min(two,three1 + min(three,four1))) +# zero=one-moved_up +# one=moved_up +# +# Of course, this assumes that random new systems won't show up in +# later groups — ghosts! Right now, I'm assuming they're rare enough +# to ignore for the purpose of this estimation. Theoretically, ghost +# systems mean that the minimum estimate is actually too high. +# +# Also: since age 1 is 1 week, all systems must move up (or vanish). +# Age 2 is 3 weeks (weeks 2,3,4), so assuming that the average number +# of new permanant installations is roughly smooth week to week (no +# huge jumps) on some days, that means we can assume turnover of 1/3 for +# that group. Age 3 is 20 weeks (weeks 5-24, inclusive), so turnover should +# be much smaller — more like 1/20! +# +# Also. this it keeps track of the percentage of a given system type that is +# ephemeral, and uses that to guess for missing values (like, the last week +# in the dataset, where there is no "next week" yet.) +# +# To consider: bias towards upgrades when a new release is just out? or does +# that get us too much seeing what we want to see? We could deterimine this +# by the first week systems seen breaks some threshold, or more cleverly +# by noticing when the curve jumps. 
+# + +import os +import sys +import string +import sqlite3 +import datetime +from collections import Counter +from tokenize import group + +DATAFILE = 'db/bronto.db' + +DATABASE = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + +loopcursor = DATABASE.cursor() +nextcursor = DATABASE.cursor() + +zerocounter = Counter() +onecounter = Counter() + + +loopcursor.execute( + "SELECT DISTINCT(dataset) FROM checkins ORDER BY dataset DESC") +groups = [item for sublist in loopcursor.fetchall() for item in sublist] + + +for group in groups: + zerocounter.clear() + onecounter.clear() + + loopcursor.execute( + 'SELECT * FROM checkins WHERE dataset = :dataset AND age = 1 ORDER BY week', {'dataset': group}) + for row in loopcursor: + (week, group, release, variant, arch, age, hits) = row + + thisone = hits + + # get the other age groups for this type of system, if any + query = """SELECT age,hits FROM checkins WHERE + week = :week AND + dataset = :dataset AND + release = :release AND + variant = :variant AND + arch = :arch AND + age > 1 + ORDER BY age + """ + nextcursor.execute(query, + {"week": week, + "dataset": group, + "release": release, + "variant": variant, + "arch": arch + }) + thisweek = nextcursor.fetchall() + twothis = 0 + threethis = 0 + fourthis = 0 + if thisweek: + for agegroup in thisweek: + (age, hits) = agegroup + if age == 2: + twothis = hits + elif age == 3: + threethis = hits + elif age == 4: + fourthis = hits + else: + raise ValueError() + + # Get the age groups for next week (if any) for this type of system + # For Fedora Linux, we're also including higher release numbers -- + # systems could be upgraded! However, that's unlikely for epel. So.. 
+ if group.split('_', 1)[0] == 'fedora': + query = """SELECT age,sum(hits) FROM checkins WHERE + week = :nextweek AND + dataset = :dataset AND + release >= :release AND + variant = :variant AND + arch = :arch AND + age > 1 + GROUP BY age + ORDER BY age""" + else: + query = """SELECT age,sum(hits) FROM checkins WHERE + week = :nextweek AND + dataset = :dataset AND + release = :release AND + variant = :variant AND + arch = :arch AND + age > 1 + GROUP BY age + ORDER BY age""" + nextcursor.execute(query, + {"nextweek": (datetime.datetime.fromisoformat(week) + datetime.timedelta(weeks=1)).strftime("%Y-%m-%d"), # this was easier when it was weeknums! + "dataset": group, + "release": release, + "variant": variant, + "arch": arch + }) + nextweek = nextcursor.fetchall() + + if nextweek: + + twonext = 0 + threenext = 0 + fournext = 0 + + for agegroup in nextweek: + (age, hits) = agegroup + if age == 2: + twonext = hits + elif age == 3: + threenext = hits + elif age == 4: + fournext = hits + else: + raise ValueError(f"age is {age}") + + # okay, whew. see long comment at top of file for explanation of the theory. + # in practice, there are these three possibilities: + # moved_up_min = max(twonext-twothis, 0) + + # max(threenext - threethis, 0) + + # max(fournext-fourthis, 0) + # moved_up_max = twonext + min(twothis, threenext + min(threethis, fournext)) + # + # moved_up_timebased = int(twothis/3 + threethis/20 + [some estimate of dropout rate of fourthis]) + # + # So what I'm doing here is for each period, going with time-based capped by + # the min and max flow for that age group. And then for age 4, arbitrarily picking + # a ratio. Note that min and timebased separate easily, but the max is difficult + # + # For now, we're assuming _minimum_ flow from age 3 into age 4. + # Why? This makes the bands for age 1 and age 2 look most reasonable. 
+ moved_to_four = max(fournext-fourthis, 0) + moved_to_three = min(max(threethis/20, max(threenext-threethis, 0)+moved_to_four), + min(twothis, threenext + moved_to_four)) + moved_to_two = min(max( + twothis/3, max(twonext-twothis, 0) + moved_to_three), min(thisone, twonext + moved_to_three)) + moved_up = min(thisone, int(moved_to_two)) + new_zero = thisone - moved_up + assert(new_zero >= 0) + new_one = moved_up + assert (new_one == thisone-new_zero) + + # keep a running total for the estimate + zerocounter[(release, variant, arch)] += new_zero + onecounter[(release, variant, arch)] += new_one + + else: # no values for next week, so... estimate! + totalprevious = zerocounter[( + release, variant, arch)] + onecounter[(release, variant, arch)] + if totalprevious: + new_zero = round( + thisone*zerocounter[(release, variant, arch)]/totalprevious) + new_one = thisone-new_zero + else: + # no estimate for this row, so assume all ephemeral + new_zero = thisone + new_one = 0 + + assert new_zero + \ + new_one == thisone, "{} + {} = {}".format( + new_zero, new_one, thisone) + + nextcursor.execute("""INSERT INTO checkins + (week, dataset, release, variant, arch, age, hits) + VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", + {"week": week, + "dataset": group, + "release": release, + "variant": variant, + "arch": arch, + "age": 0, + "hits": new_zero + }) + nextcursor.execute("""REPLACE INTO checkins + (week, dataset, release, variant, arch, age, hits) + VALUES (:week, :dataset, :release, :variant, :arch, :age, :hits)""", + {"week": week, + "dataset": group, + "release": release, + "variant": variant, + "arch": arch, + "age": 1, + "hits": new_one + }) + + # these are just clutter, and it's easier to zap them at the end + # than to avoid making them in the loop. 
+ loopcursor.execute(f"""DELETE from checkins WHERE hits=0""") + + DATABASE.commit() diff --git a/brontosaurus_fight.sh b/brontosaurus_fight.sh new file mode 100755 index 0000000..5726335 --- /dev/null +++ b/brontosaurus_fight.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Create a view which only shows the weeks where each release +# is at its peak. If someone actually is Good At SQL, I would +# not mind help making this more clear. + +sqlite3 db/bronto.db << EOF + DROP VIEW IF EXISTS peak; + CREATE VIEW peak AS + SELECT checkins.week, + checkins.dataset, + checkins.release, + checkins.variant, + checkins.arch, + checkins.age, + checkins.hits + FROM checkins + INNER JOIN + (SELECT week,dataset,release,max(hits) + FROM (SELECT week,dataset,release,sum(hits) AS hits + FROM checkins + GROUP BY week,dataset,release + ORDER BY week) + GROUP BY dataset,release) AS peaks + ON peaks.week = checkins.week + AND peaks.dataset = checkins.dataset + AND peaks.release = checkins.release; +EOF diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py new file mode 100755 index 0000000..29ca465 --- /dev/null +++ b/brontosaurus_plotter.py @@ -0,0 +1,299 @@ +#!/usr/bin/python3 + +import matplotlib.dates as dates +import matplotlib.pyplot as plt + +import sqlite3 +from string import Template + +from pprint import pprint + +import pandas as pd +import toml + +import matplotlib as m + +from brontosaurus_colorizer import load_color_cache, get_colors + +DATAFILE = 'db/bronto.db' + +m.use("Agg") + +m.style.use('seaborn-colorblind') +m.rcParams['font.size'] = 12 +m.rcParams['font.family'] = 'Montserrat' +m.rcParams['legend.frameon'] = False + + + + +def graph_timeseries(config, colormappings, params, dataframe): + """Draws line or area chart for a dataseries over time.""" + + # If we find we have missing data, in the future: + # dataframe.resample('W-MON') + + dataset = params['dataset'] + dataseries = params['dataseries'] + + ################# + # Instead of this, accumulate anything more 
than 10 into "other" + # ... and do it _elsewhere_ (easier to do before pivot anyway!) + # + limit number of columns to 10 + other + + hidelist = dataframe.div(dataframe.sum( + axis=1), axis=0).max() < 0.2/100 + dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) + + ################## + # our colors. + # the complication here is keeping the same color for the same label + # across multiple graphs! + cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], + dataset, dataseries, list(dataframe.columns))) + + ################## + # and now.... graph it! + + # FIXME: this is ugly + startdate = config['startdate'][dataset.split('_', 1)[0]] + + for view in params['views']: + + match view: + case 'line': + df = dataframe[startdate:] + kind = 'line' + colormap = cmap + case 'stacked': + df = dataframe[startdate:][dataframe.columns[::-1]] + kind = 'area' + colormap = m.colors.ListedColormap(cmap.colors[::-1]) + case 'share': + df = dataframe[startdate:].div( + dataframe.sum(axis=1), axis=0)*100 + kind = 'area' + colormap = cmap + + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind) + + # Labels and titles and stuff. 
+ ax = plt.gca() + + handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) + + if view == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] + + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) + + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': view, + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][view]} + + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) + + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) + + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') + + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") + + plt.close(graph.figure) + print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") + + +def graph_releasebars(config, colormappings, params, dataframe): + """Draws each release in the set as a bar chart""" + + # If we find we have missing data, in the future: + # dataframe.resample('W-MON') + + dataset = params['dataset'] + dataseries = params['dataseries'] + + ################# + # Instead of this, accumulate anything more than 10 into "other" + # ... and do it _elsewhere_ (easier to do before pivot anyway!) 
+ # + limit number of columns to 10 + other + + hidelist = dataframe.div(dataframe.sum( + axis=1), axis=0).max() < 0.2/100 + dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) + + ################## + # our colors. + # the complication here is keeping the same color for the same label + # across multiple graphs! + cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], + dataset, dataseries, list(dataframe.columns))) + + ################## + # and now.... graph it! + + # FIXME: this is ugly + startdate = config['startdate'][dataset.split('_', 1)[0]] + + for view in params['views']: + + match view: + case 'stacked': + df = dataframe[startdate:] + kind = 'bar' + colormap = cmap + case 'share': + df = dataframe[startdate:].div( + dataframe.sum(axis=1), axis=0)*100 + kind = 'bar' + colormap = cmap + + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind, stacked=True) + + # Labels and titles and stuff. 
+ ax = plt.gca() + + handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) + + if view == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] + + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) + + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': view, + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][view]} + + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) + + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) + + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') + + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") + + plt.close(graph.figure) + print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") + + +########################################### + + +def main(): + + config = toml.load("config.toml") + + + colormappings = load_color_cache(config['color_cache'],config['color_presets']) + + database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + + if 'timeseries' in config: + for timeseries in config['timeseries']: + params = config['timeseries_defaults'].copy() + params.update(timeseries) + + query = f"""SELECT 
week, {params['dataseries']}, SUM(hits) as hits + FROM checkins + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY week, {params['dataseries']} + ORDER BY week + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + graph_timeseries( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + + if 'byrelease' in config: + for byrelease in config['byrelease']: + params = config['byrelease_defaults'].copy() + params.update(byrelease) + + query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits + FROM peak + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY release, {params['dataseries']} + ORDER BY release + """ + df = pd.read_sql_query(query, con=database) + + graph_releasebars( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='release', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + + +''' +### getting ahead of myself: this is for the waffle charts +query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" +df = pd.read_sql_query(query, parse_dates='week', con=database) +df +''' + +if __name__ == "__main__": + main() diff --git a/brontosaurus_slicer.py b/brontosaurus_slicer.py new file mode 100755 index 0000000..8235990 --- /dev/null +++ b/brontosaurus_slicer.py @@ -0,0 +1,129 @@ +#!/bin/bash + +# This script implements the filter rules in NOTES.md, and splits +# the totals.db into a new cleaned-up table in a "bronto.db" file. +# +# It splits the records into major groups: EPEL, and then also +# "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. +# Because "group" is a reserved word in sql, we use "dataset". +# +# It removes the os_ prefix, because without repo_ columns there +# is no ambiguity to resolve. 
+# +# And, it converts weeknums (which start on January 5th, 1970, +# the first Monday of the epoch) to dates. +# +# We're dropping the os_version field from EL and instead just +# using the first digit (e.g. "8" or "9") as "release". We are +# also using _name_ as variant. We could keep these separate, +# but this way we have the same fields for both types. + + +sqlite3 db/totals.db << EOF + +ATTACH DATABASE 'db/bronto.db' AS bronto; + +DROP TABLE IF EXISTS bronto.checkins; + +CREATE TABLE bronto.checkins( + week INT, + dataset TEXT, + release TEXT, + variant TEXT, + arch TEXT, + age INT CHECK(age<5), + hits INT, + UNIQUE (week,dataset,release,variant,arch,age) +); + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_updates_systems" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag LIKE 'updates-released-f__' + AND os_version = substr(repo_tag,-2,2) + AND os_arch = repo_arch + AND os_variant != 'container' + AND os_variant != 'toolbx' + AND os_variant != 'snappy' + GROUP BY week,release,variant,arch,age; + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_updates_containers" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag LIKE 'updates-released-f__' + AND os_version = substr(repo_tag,-2,2) + AND os_arch = repo_arch + AND (os_variant = 'container' OR + os_variant = 'toolbx' OR + os_variant = 'snappy') + GROUP BY week,release,variant,arch,age; + + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_rawhide_systems" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + 
SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag = 'fedora-rawhide' OR repo_tag = 'rawhide' + AND os_arch = repo_arch + AND os_variant != 'container' + AND os_variant != 'toolbx' + AND os_variant != 'snappy' + GROUP BY week,release,variant,arch,age; + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_rawhide_containers" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag = 'rawhide' + AND os_arch = repo_arch + AND (os_variant = 'container' OR + os_variant = 'toolbx' OR + os_variant = 'snappy') + GROUP BY week,release,variant,arch,age; + + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "epel" AS dataset, + CASE instr(os_version,".") + WHEN 0 THEN os_version + ELSE substr(os_version,0,instr(os_version,".")) + END AS release, + os_name AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE repo_tag = 'epel-' || release + AND os_arch = repo_arch + GROUP BY week,release,variant,arch,age; + +DETACH bronto; + +EOF diff --git a/brontosaurus_washer.py b/brontosaurus_washer.py new file mode 100755 index 0000000..b655cb7 --- /dev/null +++ b/brontosaurus_washer.py @@ -0,0 +1,69 @@ +#!/bin/bash +# +# For every table in bronto.db, delete "known bad" variants. +# + + +# Please document each new reason for cleaning something here, so we know +# why later. +# +# The variants 09975472-cc15-4020-9231-cc6743a15b0f and c1e9af0e-5816-4644-9c89-703aa1fdcf04 +# are strings I (mattdm) used for testing. +# +# The variant "CentOS Stream v21.*" is some sort of horrible scripted thing +# apparently, where it keeps getting longer and longer with additional tags +# +# Also, for each table, sets any variant that is '' to 'none', because +# '' is hard to work with. 
(I think this is when people have manually put +# "VARIANT_ID=", as opposed to not having one. I don't think that's useful +# to track separately from 'generic', really, so an alternative would be +# to merge them... but doing this for now.) +# FIXME: this needs to merge them in case someone starts actually sending +# "none" as the string — we'll get a uniqueness constraint violation. +# +# This is a regex, in case that's not clear. +GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" + +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; + UPDATE checkins SET variant='none' WHERE variant=''; +EOF + + +# While some test systems ran Fedora Linux 31, the feature landed +# in 32 (released 2020-04-27, so drop all the old stuff. +FEDORA_STARTVER=32 +FEDORA_STARTDAY='2021-01-01' +# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) +EPEL_STARTVER=8 +EPEL_STARTDAY='2021-01-01' +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; +EOF + +# Clean up entries for name, arch, or release that show up +# with less than some threshold in _total_ hits in the +# whole database, or where the weekly numbers never exceed some +# small value. This removes both small bursts of nonsense +# and also most long-lived singletons. We may want to revisit +# what these are set to when we have more data. 
+# +# Note that since we regenerate the whole db from totals.db +# each week, if something exceeds this threshold later, it will +# suddenly appear +THRESHOLD_TOTAL=100 +THRESHOLD_WEEKLY=3 + +for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do + sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); +EOF +done diff --git a/run.sh b/run.sh index 09b84b5..02432b0 100755 --- a/run.sh +++ b/run.sh @@ -47,7 +47,7 @@ echo -n "* Fossilizing ancient images... " echo " buried." echo -n "* Slicing brontosauruses... " - ./brontosaurus-slicer.sh + ./brontosaurus_slicer.sh if [[ $? != 0 ]]; then echo "! Oops." exit 1 @@ -55,7 +55,7 @@ echo -n "* Slicing brontosauruses... " echo " into bits." echo -n "* Scrubbing off the dirt... " - ./brontosaurus-washer.sh + ./brontosaurus_washer.sh if [[ $? != 0 ]]; then echo "! Oops." exit 1 @@ -63,7 +63,7 @@ echo -n "* Scrubbing off the dirt... " echo " shiny!" echo -n "* Finding the strongest... 
" - ./brontosaurus-fight.sh + ./brontosaurus_fight.sh if [[ $? != 0 ]]; then echo "! Oops." exit 1 @@ -71,7 +71,7 @@ echo -n "* Finding the strongest... " echo " rarrhhhhr!" echo -n "* Sorting the eggs... " - ./brontosaurus-egg-sorter.py + ./brontosaurus_egg-sorter.py if [[ $? != 0 ]]; then echo "! Oops." exit 1 @@ -86,12 +86,12 @@ echo " Built!" echo "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null - ./brontosaurus-colorizer.py + ./brontosaurus_colorizer.py echo " Vibrant!" echo "* Drawing portraits from the fossilized remains... " LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) - ./brontosaurus-plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null + ./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null if [[ $? != 0 ]]; then echo "! Oops." exit 1 From 2701b4551a80acd5dda6639d71df5869bf7e9b52 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 21:56:02 +0000 Subject: [PATCH 16/49] um, yeah, python still :) --- diff --git a/brontosaurus_slicer.py b/brontosaurus_slicer.py deleted file mode 100755 index 8235990..0000000 --- a/brontosaurus_slicer.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash - -# This script implements the filter rules in NOTES.md, and splits -# the totals.db into a new cleaned-up table in a "bronto.db" file. -# -# It splits the records into major groups: EPEL, and then also -# "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. -# Because "group" is a reserved word in sql, we use "dataset". -# -# It removes the os_ prefix, because without repo_ columns there -# is no ambiguity to resolve. -# -# And, it converts weeknums (which start on January 5th, 1970, -# the first Monday of the epoch) to dates. -# -# We're dropping the os_version field from EL and instead just -# using the first digit (e.g. "8" or "9") as "release". We are -# also using _name_ as variant. We could keep these separate, -# but this way we have the same fields for both types. 
- - -sqlite3 db/totals.db << EOF - -ATTACH DATABASE 'db/bronto.db' AS bronto; - -DROP TABLE IF EXISTS bronto.checkins; - -CREATE TABLE bronto.checkins( - week INT, - dataset TEXT, - release TEXT, - variant TEXT, - arch TEXT, - age INT CHECK(age<5), - hits INT, - UNIQUE (week,dataset,release,variant,arch,age) -); - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_systems" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag LIKE 'updates-released-f__' - AND os_version = substr(repo_tag,-2,2) - AND os_arch = repo_arch - AND os_variant != 'container' - AND os_variant != 'toolbx' - AND os_variant != 'snappy' - GROUP BY week,release,variant,arch,age; - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_updates_containers" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag LIKE 'updates-released-f__' - AND os_version = substr(repo_tag,-2,2) - AND os_arch = repo_arch - AND (os_variant = 'container' OR - os_variant = 'toolbx' OR - os_variant = 'snappy') - GROUP BY week,release,variant,arch,age; - - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_systems" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag = 'fedora-rawhide' OR repo_tag = 'rawhide' - AND os_arch = repo_arch - AND os_variant != 'container' - AND os_variant != 'toolbx' - AND os_variant != 'snappy' - GROUP BY week,release,variant,arch,age; - -INSERT INTO bronto.checkins - SELECT 
date(julianday('1970-01-05')+ weeknum * 7) as week, - "fedora_rawhide_containers" AS dataset, - os_version AS release, - os_variant AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') - AND repo_tag = 'rawhide' - AND os_arch = repo_arch - AND (os_variant = 'container' OR - os_variant = 'toolbx' OR - os_variant = 'snappy') - GROUP BY week,release,variant,arch,age; - - -INSERT INTO bronto.checkins - SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, - "epel" AS dataset, - CASE instr(os_version,".") - WHEN 0 THEN os_version - ELSE substr(os_version,0,instr(os_version,".")) - END AS release, - os_name AS variant, - os_arch AS arch, - sys_age AS age, - SUM(hits) - FROM countme_totals - WHERE repo_tag = 'epel-' || release - AND os_arch = repo_arch - GROUP BY week,release,variant,arch,age; - -DETACH bronto; - -EOF diff --git a/brontosaurus_slicer.sh b/brontosaurus_slicer.sh new file mode 100755 index 0000000..8235990 --- /dev/null +++ b/brontosaurus_slicer.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# This script implements the filter rules in NOTES.md, and splits +# the totals.db into a new cleaned-up table in a "bronto.db" file. +# +# It splits the records into major groups: EPEL, and then also +# "main" Fedora Linux systems, Fedora Rawhide, and Fedora Containers. +# Because "group" is a reserved word in sql, we use "dataset". +# +# It removes the os_ prefix, because without repo_ columns there +# is no ambiguity to resolve. +# +# And, it converts weeknums (which start on January 5th, 1970, +# the first Monday of the epoch) to dates. +# +# We're dropping the os_version field from EL and instead just +# using the first digit (e.g. "8" or "9") as "release". We are +# also using _name_ as variant. We could keep these separate, +# but this way we have the same fields for both types. 
+ + +sqlite3 db/totals.db << EOF + +ATTACH DATABASE 'db/bronto.db' AS bronto; + +DROP TABLE IF EXISTS bronto.checkins; + +CREATE TABLE bronto.checkins( + week INT, + dataset TEXT, + release TEXT, + variant TEXT, + arch TEXT, + age INT CHECK(age<5), + hits INT, + UNIQUE (week,dataset,release,variant,arch,age) +); + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_updates_systems" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag LIKE 'updates-released-f__' + AND os_version = substr(repo_tag,-2,2) + AND os_arch = repo_arch + AND os_variant != 'container' + AND os_variant != 'toolbx' + AND os_variant != 'snappy' + GROUP BY week,release,variant,arch,age; + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_updates_containers" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag LIKE 'updates-released-f__' + AND os_version = substr(repo_tag,-2,2) + AND os_arch = repo_arch + AND (os_variant = 'container' OR + os_variant = 'toolbx' OR + os_variant = 'snappy') + GROUP BY week,release,variant,arch,age; + + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_rawhide_systems" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag = 'fedora-rawhide' OR repo_tag = 'rawhide' + AND os_arch = repo_arch + AND os_variant != 'container' + AND os_variant != 'toolbx' + AND os_variant != 'snappy' + GROUP BY week,release,variant,arch,age; + +INSERT INTO bronto.checkins + SELECT 
date(julianday('1970-01-05')+ weeknum * 7) as week, + "fedora_rawhide_containers" AS dataset, + os_version AS release, + os_variant AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE (os_name = 'Fedora' or os_name = 'Fedora Linux') + AND repo_tag = 'rawhide' + AND os_arch = repo_arch + AND (os_variant = 'container' OR + os_variant = 'toolbx' OR + os_variant = 'snappy') + GROUP BY week,release,variant,arch,age; + + +INSERT INTO bronto.checkins + SELECT date(julianday('1970-01-05')+ weeknum * 7) as week, + "epel" AS dataset, + CASE instr(os_version,".") + WHEN 0 THEN os_version + ELSE substr(os_version,0,instr(os_version,".")) + END AS release, + os_name AS variant, + os_arch AS arch, + sys_age AS age, + SUM(hits) + FROM countme_totals + WHERE repo_tag = 'epel-' || release + AND os_arch = repo_arch + GROUP BY week,release,variant,arch,age; + +DETACH bronto; + +EOF diff --git a/brontosaurus_washer.py b/brontosaurus_washer.py deleted file mode 100755 index b655cb7..0000000 --- a/brontosaurus_washer.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# -# For every table in bronto.db, delete "known bad" variants. -# - - -# Please document each new reason for cleaning something here, so we know -# why later. -# -# The variants 09975472-cc15-4020-9231-cc6743a15b0f and c1e9af0e-5816-4644-9c89-703aa1fdcf04 -# are strings I (mattdm) used for testing. -# -# The variant "CentOS Stream v21.*" is some sort of horrible scripted thing -# apparently, where it keeps getting longer and longer with additional tags -# -# Also, for each table, sets any variant that is '' to 'none', because -# '' is hard to work with. (I think this is when people have manually put -# "VARIANT_ID=", as opposed to not having one. I don't think that's useful -# to track separately from 'generic', really, so an alternative would be -# to merge them... but doing this for now.) 
-# FIXME: this needs to merge them in case someone starts actually sending -# "none" as the string — we'll get a uniqueness constraint violation. -# -# This is a regex, in case that's not clear. -GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" - -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; - UPDATE checkins SET variant='none' WHERE variant=''; -EOF - - -# While some test systems ran Fedora Linux 31, the feature landed -# in 32 (released 2020-04-27, so drop all the old stuff. -FEDORA_STARTVER=32 -FEDORA_STARTDAY='2021-01-01' -# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) -EPEL_STARTVER=8 -EPEL_STARTDAY='2021-01-01' -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; -EOF - -# Clean up entries for name, arch, or release that show up -# with less than some threshold in _total_ hits in the -# whole database, or where the weekly numbers never exceed some -# small value. This removes both small bursts of nonsense -# and also most long-lived singletons. We may want to revisit -# what these are set to when we have more data. 
-# -# Note that since we regenerate the whole db from totals.db -# each week, if something exceeds this threshold later, it will -# suddenly appear -THRESHOLD_TOTAL=100 -THRESHOLD_WEEKLY=3 - -for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do - sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); -EOF -done diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh new file mode 100755 index 0000000..b655cb7 --- /dev/null +++ b/brontosaurus_washer.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# +# For every table in bronto.db, delete "known bad" variants. +# + + +# Please document each new reason for cleaning something here, so we know +# why later. +# +# The variants 09975472-cc15-4020-9231-cc6743a15b0f and c1e9af0e-5816-4644-9c89-703aa1fdcf04 +# are strings I (mattdm) used for testing. 
+# +# The variant "CentOS Stream v21.*" is some sort of horrible scripted thing +# apparently, where it keeps getting longer and longer with additional tags +# +# Also, for each table, sets any variant that is '' to 'none', because +# '' is hard to work with. (I think this is when people have manually put +# "VARIANT_ID=", as opposed to not having one. I don't think that's useful +# to track separately from 'generic', really, so an alternative would be +# to merge them... but doing this for now.) +# FIXME: this needs to merge them in case someone starts actually sending +# "none" as the string — we'll get a uniqueness constraint violation. +# +# This is a regex, in case that's not clear. +GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" + +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; + UPDATE checkins SET variant='none' WHERE variant=''; +EOF + + +# While some test systems ran Fedora Linux 31, the feature landed +# in 32 (released 2020-04-27, so drop all the old stuff. +FEDORA_STARTVER=32 +FEDORA_STARTDAY='2021-01-01' +# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) +EPEL_STARTVER=8 +EPEL_STARTDAY='2021-01-01' +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; +EOF + +# Clean up entries for name, arch, or release that show up +# with less than some threshold in _total_ hits in the +# whole database, or where the weekly numbers never exceed some +# small value. This removes both small bursts of nonsense +# and also most long-lived singletons. We may want to revisit +# what these are set to when we have more data. 
+# +# Note that since we regenerate the whole db from totals.db +# each week, if something exceeds this threshold later, it will +# suddenly appear +THRESHOLD_TOTAL=100 +THRESHOLD_WEEKLY=3 + +for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do + sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); +EOF +done From b5f0debf73559f007deed675e1b6221e21f9eae7 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 21:56:36 +0000 Subject: [PATCH 17/49] one line, sure --- diff --git a/run.sh b/run.sh index 02432b0..a8d85a0 100755 --- a/run.sh +++ b/run.sh @@ -84,10 +84,10 @@ echo "* Creating cages for different exhibits..." done echo " Built!" -echo "* Painting the feathers..." +echo -n "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null ./brontosaurus_colorizer.py -echo " Vibrant!" +echo " vibrant!" echo "* Drawing portraits from the fossilized remains... 
" LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) From 3e01bc96db958bcb841dc5436376f30915ce35cd Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 23 2022 22:49:03 +0000 Subject: [PATCH 18/49] maybe I'm just hitting myself in the face making the cache be toml? --- diff --git a/brontosaurus_colorizer.py b/brontosaurus_colorizer.py index 878f4dc..e3870f4 100755 --- a/brontosaurus_colorizer.py +++ b/brontosaurus_colorizer.py @@ -15,7 +15,7 @@ from collections import OrderedDict import toml import re - +from deepmerge import always_merger def get_colors(colormappings, colorlist, dataset, dataseries, items): @@ -28,8 +28,9 @@ def get_colors(colormappings, colorlist, dataset, dataseries, items): outcolors = [] for item in items: if item not in colormappings[key]: - colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] - outcolors.append(colormappings[key][str(item)]) + print(f"Miss {item} in {key}") + colormappings[key][item] = colorlist[len(colormappings[key]) % len(colorlist)] + outcolors.append(colormappings[key][item]) return outcolors @@ -45,7 +46,7 @@ def load_color_cache(cachefile,presetfile): try: presets = toml.load(presetfile) - cached.update(presets) + always_merger.merge(cached,presets) except FileNotFoundError: print(f"No color preset file {presetfile} found.") diff --git a/color-presets.toml b/color-presets.toml index dd54edb..8c0daf7 100644 --- a/color-presets.toml +++ b/color-presets.toml @@ -1,8 +1,8 @@ ["epel.variant"] -"CentOS Linux" = "#101010" +"CentOS Linux" = "#808080" "Red Hat Enterprise Linux" = "#ee0000" "CentOS Stream" = "#a14a8c" "Rocky Linux" = "#10b981" "AlmaLinux" = "#ffcc0a" -"Oracle Linux Server" = "#aaaaaa" +"Oracle Linux Server" = "#101010" "CloudLinux" = "#0097f3" From c1080ba0f8906845e8dfc41ad6de60b21b50ea50 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 00:07:37 +0000 Subject: [PATCH 19/49] there now toml works --- diff --git a/brontosaurus_colorizer.py 
b/brontosaurus_colorizer.py index e3870f4..c39cb4f 100755 --- a/brontosaurus_colorizer.py +++ b/brontosaurus_colorizer.py @@ -27,10 +27,10 @@ def get_colors(colormappings, colorlist, dataset, dataseries, items): # and save that for later. outcolors = [] for item in items: - if item not in colormappings[key]: + if str(item) not in colormappings[key]: print(f"Miss {item} in {key}") - colormappings[key][item] = colorlist[len(colormappings[key]) % len(colorlist)] - outcolors.append(colormappings[key][item]) + colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] + outcolors.append(colormappings[key][str(item)]) return outcolors @@ -54,7 +54,10 @@ def load_color_cache(cachefile,presetfile): # gotta do this because we want a defaultdict but # toml load just gives us a regular dict. for key in cached.keys(): - colormappings[key] = cached[key].copy() + # and this because we want the item keys to be strings + # even if they look like integers + for (item,color) in cached[key].items(): + colormappings[key][str(item)] = cached[key][item] return colormappings From cc15d2400a73e80ec9539a3e597f0205e0540297 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 01:35:59 +0000 Subject: [PATCH 20/49] good enough for now! --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 29ca465..46d972c 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -1,4 +1,17 @@ #!/usr/bin/python3 +""" +Brontosaurus Plotter + +Usage: + brontosaurus_plotter.py timeseries ( line | stacked | share ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py byrelease ( stacked | split ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py show + +Options: + --exclude ... In the form `dataseries:element`. Can repeat. + --include ... As above, but include _only_ these. 
+ --cutoff Drop items where the dataseries has less than n total hits +""" import matplotlib.dates as dates import matplotlib.pyplot as plt @@ -13,6 +26,8 @@ import toml import matplotlib as m +from docopt import docopt + from brontosaurus_colorizer import load_color_cache, get_colors DATAFILE = 'db/bronto.db' @@ -27,113 +42,10 @@ m.rcParams['legend.frameon'] = False -def graph_timeseries(config, colormappings, params, dataframe): - """Draws line or area chart for a dataseries over time.""" - - # If we find we have missing data, in the future: - # dataframe.resample('W-MON') - - dataset = params['dataset'] - dataseries = params['dataseries'] - - ################# - # Instead of this, accumulate anything more than 10 into "other" - # ... and do it _elsewhere_ (easier to do before pivot anyway!) - # + limit number of columns to 10 + other - - hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < 0.2/100 - dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) - - ################## - # our colors. - # the complication here is keeping the same color for the same label - # across multiple graphs! - cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], - dataset, dataseries, list(dataframe.columns))) - - ################## - # and now.... graph it! - - # FIXME: this is ugly - startdate = config['startdate'][dataset.split('_', 1)[0]] - - for view in params['views']: - - match view: - case 'line': - df = dataframe[startdate:] - kind = 'line' - colormap = cmap - case 'stacked': - df = dataframe[startdate:][dataframe.columns[::-1]] - kind = 'area' - colormap = m.colors.ListedColormap(cmap.colors[::-1]) - case 'share': - df = dataframe[startdate:].div( - dataframe.sum(axis=1), axis=0)*100 - kind = 'area' - colormap = cmap - - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind) - - # Labels and titles and stuff. 
- ax = plt.gca() - - handles, labels = ax.get_legend_handles_labels() - - # TODO: generalize this - if dataseries == 'age': - labels = list(map(config['age_labels'].get, labels)) - - if view == 'stacked': - handles[:] = handles[::-1] - labels[:] = labels[::-1] - - plt.legend(handles, labels, loc='center left', - bbox_to_anchor=(1.0, 0.5)) - - madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'view': view, - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][view]} - - if 'title' in params: - plt.suptitle(Template(params['title']).safe_substitute(madlibs), - fontsize=24) - - # FIX: make work - if 'subtitle' in params: - graph.set_title( - Template(params['subtitle']).safe_substitute(madlibs), - fontsize=14) - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') - - for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", - dpi=config['dpi'], bbox_inches="tight") - - plt.close(graph.figure) - print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") - - -def graph_releasebars(config, colormappings, params, dataframe): - """Draws each release in the set as a bar chart""" +def graph(config, colormappings, params, dataframe): + # If we find we have missing data, in the future: # dataframe.resample('W-MON') @@ -162,73 +74,89 @@ def graph_releasebars(config, colormappings, params, dataframe): # FIXME: this is ugly startdate = config['startdate'][dataset.split('_', 1)[0]] - for view in 
params['views']: - - match view: - case 'stacked': - df = dataframe[startdate:] - kind = 'bar' - colormap = cmap - case 'share': + stacked = True + + match params['type']: + case 'byrelease': + kind = 'bar' + case 'timeseries': + kind = 'area' + + match params['view']: + case 'line': + df = dataframe[startdate:] + kind = 'line' # overrides 'area' + stacked = False # true everywhere else! + colormap = cmap + case 'stacked': + df = dataframe[startdate:][dataframe.columns[::-1]] + colormap = m.colors.ListedColormap(cmap.colors[::-1]) + case 'share': + if dataseries == 'age': + # lower numbers are newer! + df = dataframe[startdate:][dataframe.columns[::-1]].div( + dataframe.sum(axis=1), axis=0)*100 + colormap = m.colors.ListedColormap(cmap.colors[::-1]) + else: + # todo: sort arch and variant by popularity, not name! df = dataframe[startdate:].div( dataframe.sum(axis=1), axis=0)*100 - kind = 'bar' colormap = cmap - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind, stacked=True) + # Start the actual graph + graph = df.plot(figsize=config['figsize'], + colormap=colormap, kind=kind, stacked=stacked) - # Labels and titles and stuff. - ax = plt.gca() + # Labels and titles and stuff. 
+ ax = plt.gca() - handles, labels = ax.get_legend_handles_labels() + handles, labels = ax.get_legend_handles_labels() - # TODO: generalize this - if dataseries == 'age': - labels = list(map(config['age_labels'].get, labels)) + # TODO: generalize this + if dataseries == 'age': + labels = list(map(config['age_labels'].get, labels)) - if view == 'stacked': - handles[:] = handles[::-1] - labels[:] = labels[::-1] + if params['view'] == 'stacked': + handles[:] = handles[::-1] + labels[:] = labels[::-1] - plt.legend(handles, labels, loc='center left', - bbox_to_anchor=(1.0, 0.5)) + plt.legend(handles, labels, loc='center left', + bbox_to_anchor=(1.0, 0.5)) - madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'view': view, - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][view]} + madlibs = {'dataseries': dataseries, + 'dataset': dataset, + 'view': params['view'], + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][params['view']]} - if 'title' in params: - plt.suptitle(Template(params['title']).safe_substitute(madlibs), - fontsize=24) + if 'title' in params: + plt.suptitle(Template(params['title']).safe_substitute(madlibs), + fontsize=24) - # FIX: make work - if 'subtitle' in params: - graph.set_title( - Template(params['subtitle']).safe_substitute(madlibs), - fontsize=14) + # FIX: make work + if 'subtitle' in params: + graph.set_title( + Template(params['subtitle']).safe_substitute(madlibs), + fontsize=14) - plt.autoscale(enable=True, axis='x', tight=True) - plt.autoscale(enable=True, axis='y', tight=False) - graph.set_ylim([0, None]) - graph.spines['right'].set_visible(False) - graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) - # 
graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) - graph.set_xlabel('') + plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) + graph.set_ylim([0, None]) + graph.spines['right'].set_visible(False) + graph.spines['top'].set_visible(False) + sFormatter = m.ticker.ScalarFormatter() + sFormatter.set_scientific(False) + graph.yaxis.set_major_formatter(sFormatter) + # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) + graph.set_xlabel('') - for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", - dpi=config['dpi'], bbox_inches="tight") + for ext in config['image_types']: + graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + dpi=config['dpi'], bbox_inches="tight") - plt.close(graph.figure) - print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") + plt.close(graph.figure) + print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") ########################################### @@ -236,64 +164,81 @@ def graph_releasebars(config, colormappings, params, dataframe): def main(): - config = toml.load("config.toml") + arguments = docopt(__doc__, version='0.1') + + config = toml.load("config.toml") colormappings = load_color_cache(config['color_cache'],config['color_presets']) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + cur = database.cursor() + cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") + timestamp = cur.fetchone() + print(f"TIME: {timestamp}") + + #pprint(arguments) + if 'timeseries' in config: for timeseries in config['timeseries']: params = config['timeseries_defaults'].copy() params.update(timeseries) - - query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits - FROM checkins - WHERE dataset =\"{params['dataset']}\" - 
{params['extraselect']} - GROUP BY week, {params['dataseries']} - ORDER BY week - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - graph_timeseries( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) + params['type'] = 'timeseries' + + for view in params['views']: + params['view']=view + + query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits + FROM checkins + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY week, {params['dataseries']} + ORDER BY week + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + graph( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], + values='hits').astype("Int64"), + ) if 'byrelease' in config: for byrelease in config['byrelease']: params = config['byrelease_defaults'].copy() params.update(byrelease) - - query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits - FROM peak - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY release, {params['dataseries']} - ORDER BY release - """ - df = pd.read_sql_query(query, con=database) - - graph_releasebars( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='release', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - -''' -### getting ahead of myself: this is for the waffle charts -query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" -df = pd.read_sql_query(query, parse_dates='week', con=database) -df -''' + params['type'] = 'byrelease' + + for view in params['views']: + params['view']=view + + query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits + FROM peak + WHERE dataset =\"{params['dataset']}\" + {params['extraselect']} + GROUP BY 
release, {params['dataseries']} + ORDER BY release + """ + df = pd.read_sql_query(query, con=database) + + graph( + config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='release', columns=params['dataseries'], + values='hits').astype("Int64"), + ) + + + ''' + ### getting ahead of myself: this is for the waffle charts + query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" + df = pd.read_sql_query(query, parse_dates='week', con=database) + df + ''' if __name__ == "__main__": main() diff --git a/config.toml b/config.toml index 01163b4..2a52e6c 100644 --- a/config.toml +++ b/config.toml @@ -3,7 +3,7 @@ datafile = "db/bronto.db" color_presets = "color-presets.toml" color_cache = "db/color-cache.toml" -ephemeral = "all" +imagepath="images/$filetype/$dataset/" figsize = [16, 9] dpi = 300 @@ -67,8 +67,8 @@ age="age category" [timeseries_defaults] title="$dataset_label: weekly checkins by $dataseries_label$view_label" -filebase="$dataset-timeseries-$dataseries-$view" extraselect="" +filebase="$dataset-timeseries-$dataseries-$view" # not all of these are implemented. But we could have... #subtitle= #dataset= @@ -78,6 +78,14 @@ extraselect="" # todo: back to the idea of reading these # from individual, merged configuration files! 
+[byrelease_defaults] +title="$dataset_label: $dataseries_label by release" +subtitle="data for each release taken from the week of that release's (current) peak" +filebase="$dataset-byrelease-$dataseries-$view" +extraselect="" +views=['stacked','share'] + + [[timeseries]] dataset="fedora_updates_systems" dataseries="release" @@ -440,13 +448,6 @@ views=['line','share'] extraselect="AND age>0" filebase="$dataset-timeseries-$dataseries-$view-persistent" -[byrelease_defaults] -title="$dataset_label: $dataseries_label by release" -subtitle="data for each release taken from the week of that release's (current) peak" -filebase="$dataset-byrelease-$dataseries-$view" -extraselect="" -views=['stacked','share'] - [[byrelease]] dataset="fedora_updates_systems" dataseries="age" From 926ddf1ebf99c93a02bc997680c93fca860cbff7 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 19:09:29 +0000 Subject: [PATCH 21/49] reorg --- diff --git a/brontosaurus_colorizer.py b/brontosaurus_colorizer.py index c39cb4f..69b18eb 100755 --- a/brontosaurus_colorizer.py +++ b/brontosaurus_colorizer.py @@ -8,65 +8,18 @@ This is saved to from pprint import pprint import sqlite3 -from collections import defaultdict -from collections import OrderedDict - import toml import re -from deepmerge import always_merger - - -def get_colors(colormappings, colorlist, dataset, dataseries, items): - """This makes colors 'sticky' for the whole run.""" - - key = dataset + '.' + dataseries - - # for each label item, assign the next color in the colorlist - # and save that for later. 
- outcolors = [] - for item in items: - if str(item) not in colormappings[key]: - print(f"Miss {item} in {key}") - colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] - outcolors.append(colormappings[key][str(item)]) - - return outcolors - - -def load_color_cache(cachefile,presetfile): - - colormappings = defaultdict(OrderedDict) - try: - cached = toml.load(cachefile) - except FileNotFoundError: - print(f"Can't open color cache {cachefile}, so starting fresh.") - cached = {} - - try: - presets = toml.load(presetfile) - always_merger.merge(cached,presets) - except FileNotFoundError: - print(f"No color preset file {presetfile} found.") - - - # gotta do this because we want a defaultdict but - # toml load just gives us a regular dict. - for key in cached.keys(): - # and this because we want the item keys to be strings - # even if they look like integers - for (item,color) in cached[key].items(): - colormappings[key][str(item)] = cached[key][item] - - return colormappings +from brontosaurusifier_utils import colormapping def main(): config = toml.load("config.toml") - colormappings = load_color_cache(config['color_cache'],config['color_presets']) + colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) database = sqlite3.connect(config['datafile'], detect_types=sqlite3.PARSE_DECLTYPES) cur = database.cursor() @@ -107,7 +60,7 @@ def main(): items = [t[0] for t in cur.fetchall()] #print(dataset,dataseries,items) - get_colors(colormappings, config['colors'], dataset, dataseries, items) + colormapping.get_colors(colormappings, config['colors'], dataset, dataseries, items) with open(config['color_cache'], "w") as toml_file: diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 46d972c..e382d8a 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -13,7 +13,7 @@ Options: --cutoff Drop items where the dataseries has less than n total hits """ -import matplotlib.dates as dates 
+#import matplotlib.dates as dates import matplotlib.pyplot as plt import sqlite3 @@ -28,7 +28,7 @@ import matplotlib as m from docopt import docopt -from brontosaurus_colorizer import load_color_cache, get_colors +from brontosaurusifier_utils import colormapping DATAFILE = 'db/bronto.db' @@ -65,7 +65,7 @@ def graph(config, colormappings, params, dataframe): # our colors. # the complication here is keeping the same color for the same label # across multiple graphs! - cmap = m.colors.ListedColormap(get_colors(colormappings, config['colors'], + cmap = m.colors.ListedColormap(colormapping.get_colors(colormappings, config['colors'], dataset, dataseries, list(dataframe.columns))) ################## @@ -169,7 +169,7 @@ def main(): config = toml.load("config.toml") - colormappings = load_color_cache(config['color_cache'],config['color_presets']) + colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) diff --git a/brontosaurusifier_utils/__init__.py b/brontosaurusifier_utils/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/brontosaurusifier_utils/__init__.py diff --git a/brontosaurusifier_utils/colormapping.py b/brontosaurusifier_utils/colormapping.py new file mode 100644 index 0000000..c338a07 --- /dev/null +++ b/brontosaurusifier_utils/colormapping.py @@ -0,0 +1,52 @@ + + +from collections import defaultdict +from collections import OrderedDict + + +import toml + + +from deepmerge import always_merger + +def get_colors(colormappings, colorlist, dataset, dataseries, items): + """This makes colors 'sticky' for the whole run.""" + + key = dataset + '.' + dataseries + + # for each label item, assign the next color in the colorlist + # and save that for later. 
+ outcolors = [] + for item in items: + if str(item) not in colormappings[key]: + colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] + outcolors.append(colormappings[key][str(item)]) + + return outcolors + + +def load_color_cache(cachefile,presetfile): + + colormappings = defaultdict(OrderedDict) + try: + cached = toml.load(cachefile) + except FileNotFoundError: + print(f"Can't open color cache {cachefile}, so starting fresh.") + cached = {} + + try: + presets = toml.load(presetfile) + always_merger.merge(cached,presets) + except FileNotFoundError: + print(f"No color preset file {presetfile} found.") + + + # gotta do this because we want a defaultdict but + # toml load just gives us a regular dict. + for key in cached.keys(): + # and this because we want the item keys to be strings + # even if they look like integers + for (item,color) in cached[key].items(): + colormappings[key][str(item)] = cached[key][item] + + return colormappings \ No newline at end of file From dbcab8e0f446594886b1895257ee13f3e2b9add3 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 20:12:55 +0000 Subject: [PATCH 22/49] wip --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index e382d8a..d60062e 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -3,9 +3,8 @@ Brontosaurus Plotter Usage: - brontosaurus_plotter.py timeseries ( line | stacked | share ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py byrelease ( stacked | split ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py show + brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py releasebars ( text | stacked | share | split ) [--exclude ...|--include ...] [--cutoff ] Options: --exclude ... In the form `dataseries:element`. Can repeat. 
@@ -24,6 +23,8 @@ from pprint import pprint import pandas as pd import toml +import re + import matplotlib as m from docopt import docopt @@ -44,7 +45,7 @@ m.rcParams['legend.frameon'] = False -def graph(config, colormappings, params, dataframe): +def draw_graph(config, colormappings, params, dataframe): # If we find we have missing data, in the future: # dataframe.resample('W-MON') @@ -75,19 +76,15 @@ def graph(config, colormappings, params, dataframe): startdate = config['startdate'][dataset.split('_', 1)[0]] stacked = True + subplots = False match params['type']: - case 'byrelease': + case 'releasebars': kind = 'bar' case 'timeseries': kind = 'area' - match params['view']: - case 'line': - df = dataframe[startdate:] - kind = 'line' # overrides 'area' - stacked = False # true everywhere else! - colormap = cmap + match params['graph']: case 'stacked': df = dataframe[startdate:][dataframe.columns[::-1]] colormap = m.colors.ListedColormap(cmap.colors[::-1]) @@ -102,10 +99,26 @@ def graph(config, colormappings, params, dataframe): df = dataframe[startdate:].div( dataframe.sum(axis=1), axis=0)*100 colormap = cmap + case 'line': + """ This is timeseries-only. """ + df = dataframe[startdate:] + kind = 'line' # overrides 'area' + stacked = False # true everywhere else! + colormap = cmap + case 'split': + """ This is releasebars-only. """ + df = dataframe[startdate:] + colormap = cmap + subplots = True + + # Start the actual graph graph = df.plot(figsize=config['figsize'], - colormap=colormap, kind=kind, stacked=stacked) + colormap=colormap, + kind=kind, + stacked=stacked, + subplots=subplots) # Labels and titles and stuff. 
ax = plt.gca() @@ -176,69 +189,87 @@ def main(): cur = database.cursor() cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") timestamp = cur.fetchone() - print(f"TIME: {timestamp}") - - #pprint(arguments) - - if 'timeseries' in config: - for timeseries in config['timeseries']: - params = config['timeseries_defaults'].copy() - params.update(timeseries) - params['type'] = 'timeseries' - - for view in params['views']: - params['view']=view - - query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits - FROM checkins - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY week, {params['dataseries']} - ORDER BY week - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - graph( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - if 'byrelease' in config: - for byrelease in config['byrelease']: - params = config['byrelease_defaults'].copy() - params.update(byrelease) - params['type'] = 'byrelease' - - for view in params['views']: - params['view']=view - - query = f"""SELECT release, {params['dataseries']}, SUM(hits) as hits - FROM peak - WHERE dataset =\"{params['dataset']}\" - {params['extraselect']} - GROUP BY release, {params['dataseries']} - ORDER BY release - """ - df = pd.read_sql_query(query, con=database) - - graph( - config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='release', columns=params['dataseries'], + params = config['graph_defaults'].copy() + params['timestamp'] = timestamp + + cur.execute("SELECT dataset FROM checkins GROUP BY dataset") + datasets = [t[0] for t in cur.fetchall()] + cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() + dataserieses = [t[0] for t in cur.description] + dataserieses.remove('week') + dataserieses.remove('dataset') + dataserieses.remove('hits') + + pprint(arguments) + + + 
dataset=arguments[''] + dataseries=arguments[''] + + if not re.match('^[0-9a-z_]*$', dataset): + print(f"Bad dataset name! '%{dataset}") + exit(1) + if not dataset in datasets: + print(f"Dataset '%{dataset}' not in database.") + exit(1) + if not re.match('^[0-9a-z_]*$', dataseries): + print(f"Bad dataseries name! '%{dataseries}") + exit(1) + if not dataseries in dataserieses: + print(f"Dataseries '%{dataseries}' not in database.") + exit(1) + + + + if arguments['timeseries']: + params['type'] = 'timeseries' + elif arguments['releasebars']: + params['type'] = 'releasebars' + # TODO: waffle! + # read in defaults from config.toml + params.update(config[params['type']]) + + table = params['table'] + + + # maybe docopt isn't the best choice. oh well. + for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: + if arguments[graphtype]: + params['graph']=graphtype + break + + + if params['graph'] == 'text': + query = f"""SELECT {dataseries},sum(hits) AS total + FROM {table} + WHERE dataset = '{dataset}' + GROUP BY {dataseries} + ORDER BY total + ASC + """ + cur.execute(query) + # TODO: add title here + for (item,hits) in cur: + print(f"{hits:-10} — {item:40}") + else: + xaxis = params['xaxis'] + + query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits + FROM {table} + WHERE dataset =\"{dataset}\" + GROUP BY {xaxis}, {params['dataseries']} + ORDER BY {xaxis} + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + draw_graph(config=config, + colormappings=colormappings, + params=params, + dataframe=df.pivot(index='week', columns=params['dataseries'], values='hits').astype("Int64"), ) - ''' - ### getting ahead of myself: this is for the waffle charts - query = """SELECT * from checkins where week >= (SELECT DISTINCT(week) FROM checkins ORDER BY week DESC LIMIT 4,1)""" - df = pd.read_sql_query(query, parse_dates='week', con=database) - df - ''' - if __name__ == "__main__": main() diff --git a/config.toml b/config.toml index 2a52e6c..b65d634 
100644 --- a/config.toml +++ b/config.toml @@ -65,474 +65,19 @@ age="age category" 'stacked'=" (stacked)" 'share'=" (share)" -[timeseries_defaults] -title="$dataset_label: weekly checkins by $dataseries_label$view_label" +[graph_defaults] +table="checkins" +title="$dataset_label: $dataseries_label $type_label" +# TODO: list options! + +[timeseries] +subtitle="weekly checkins" +xaxis = "week" extraselect="" filebase="$dataset-timeseries-$dataseries-$view" -# not all of these are implemented. But we could have... -#subtitle= -#dataset= -#dataseries= -#orderbyhits= -#reverse= -# todo: back to the idea of reading these -# from individual, merged configuration files! -[byrelease_defaults] -title="$dataset_label: $dataseries_label by release" +[releasebars] +table="peak" +xaxis = "release" subtitle="data for each release taken from the week of that release's (current) peak" -filebase="$dataset-byrelease-$dataseries-$view" -extraselect="" -views=['stacked','share'] - - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="arch" 
-views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" 
-filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_updates_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_systems" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="age" -views=['share','stacked'] - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" 
-dataset="fedora_rawhide_systems" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_systems" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="arch" 
-views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="fedora_rawhide_containers" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - - -[[timeseries]] -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="release" -views=['line','stacked','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="epel" -dataseries="age" -views=['share','stacked'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -dataset="epel" -dataseries="arch" -views=['line','share'] -filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="arch" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="arch" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[timeseries]] -dataset="epel" -dataseries="variant" -views=['line','share'] 
-filebase="$dataset-timeseries-$dataseries-$view-all" - -[[timeseries]] -subtitle="ephemeral systems" -dataset="epel" -dataseries="variant" -views=['line','share'] -extraselect="AND age=0" -filebase="$dataset-timeseries-$dataseries-$view-ephemeral" - -[[timeseries]] -subtitle="persistent systems" -dataset="epel" -dataseries="variant" -views=['line','share'] -extraselect="AND age>0" -filebase="$dataset-timeseries-$dataseries-$view-persistent" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="age" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="arch" -filebase="$dataset-byrelease-$dataseries-$view-all" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="arch" -extraselect="AND age=0" -filebase="$dataset-byrelease-$dataseries-$view-ephemeral" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="arch" -extraselect="AND age>0" -filebase="$dataset-byrelease-$dataseries-$view-persistent" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="variant" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="variant" -extraselect="AND age=0" -filebase="$dataset-byrelease-$dataseries-$view-ephemeral" - -[[byrelease]] -dataset="fedora_updates_systems" -dataseries="variant" -extraselect="AND age>0" -filebase="$dataset-byrelease-$dataseries-$view-persistent" - -[[byrelease]] -dataset="fedora_updates_containers" -dataseries="age" - -[[byrelease]] -dataset="fedora_updates_containers" -dataseries="arch" - -[[byrelease]] -dataset="fedora_updates_containers" -dataseries="variant" - - -[[byrelease]] -dataset="fedora_rawhide_systems" -dataseries="age" - -[[byrelease]] -dataset="fedora_rawhide_systems" -dataseries="arch" - -[[byrelease]] -dataset="fedora_rawhide_systems" -dataseries="variant" - - -[[byrelease]] -dataset="fedora_rawhide_containers" -dataseries="age" - -[[byrelease]] -dataset="fedora_rawhide_containers" -dataseries="arch" - -[[byrelease]] -dataset="fedora_rawhide_containers" -dataseries="variant" - - 
-[[byrelease]] -dataset="epel" -dataseries="age" - -[[byrelease]] -dataset="epel" -dataseries="arch" - -[[byrelease]] -dataset="epel" -dataseries="variant" From 1934955a652347e3860f50f862101e4e8491fbe4 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 23:24:00 +0000 Subject: [PATCH 23/49] ahahaa --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index d60062e..17bf045 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -44,22 +44,113 @@ m.rcParams['legend.frameon'] = False +def main(): + + arguments = docopt(__doc__, version='0.1') + + + config = toml.load("config.toml") + + colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) + + database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + + cur = database.cursor() + cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") + timestamp = cur.fetchone() + + params = config['graph_defaults'].copy() + params['timestamp'] = timestamp + + cur.execute("SELECT dataset FROM checkins GROUP BY dataset") + datasets = [t[0] for t in cur.fetchall()] + cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() + dataserieses = [t[0] for t in cur.description] + dataserieses.remove('week') + dataserieses.remove('dataset') + dataserieses.remove('hits') + + #pprint(arguments) + + + dataset=arguments[''] + dataseries=arguments[''] + + if not re.match('^[0-9a-z_]*$', dataset): + print(f"Bad dataset name! '%{dataset}") + exit(1) + if not dataset in datasets: + print(f"Dataset '%{dataset}' not in database.") + exit(1) + if not re.match('^[0-9a-z_]*$', dataseries): + print(f"Bad dataseries name! 
'%{dataseries}") + exit(1) + if not dataseries in dataserieses: + print(f"Dataseries '%{dataseries}' not in database.") + exit(1) + + startdate = config['startdate'][dataset] + + + if arguments['timeseries']: + params['type'] = 'timeseries' + elif arguments['releasebars']: + params['type'] = 'releasebars' + # TODO: waffle! + + # read in defaults from config.toml + params.update(config[params['type']]) + + table = params['table'] + + + # maybe docopt isn't the best choice. oh well. + for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: + if arguments[graphtype]: + params['graph']=graphtype + break + -def draw_graph(config, colormappings, params, dataframe): + if params['graph'] == 'text': + query = f"""SELECT {dataseries},sum(hits) AS total + FROM {table} + WHERE dataset = '{dataset}' + GROUP BY {dataseries} + ORDER BY total + ASC + """ + cur.execute(query) + # TODO: add title here + for (item,hits) in cur: + print(f"{hits:-10} — {item:40}") + exit(0) - # If we find we have missing data, in the future: - # dataframe.resample('W-MON') + xaxis = params['xaxis'] + + query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits + FROM {table} + WHERE dataset =\"{dataset}\" + GROUP BY {xaxis}, {dataseries} + ORDER BY {xaxis} + """ + df = pd.read_sql_query(query, parse_dates='week', con=database) + + + dataframe=df.pivot(index=xaxis, + columns=dataseries, + values='hits' + ).astype("Int64") - dataset = params['dataset'] - dataseries = params['dataseries'] + # Smooth over any missing data + #dataframe.resample('W-MON') + pprint(dataframe ) ################# # Instead of this, accumulate anything more than 10 into "other" # ... and do it _elsewhere_ (easier to do before pivot anyway!) 
# + limit number of columns to 10 + other - hidelist = dataframe.div(dataframe.sum( - axis=1), axis=0).max() < 0.2/100 + hidelist = dataframe.div(dataframe.sum(axis=1), axis=0).max() < 0.2/100 dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) ################## @@ -72,8 +163,6 @@ def draw_graph(config, colormappings, params, dataframe): ################## # and now.... graph it! - # FIXME: this is ugly - startdate = config['startdate'][dataset.split('_', 1)[0]] stacked = True subplots = False @@ -129,7 +218,8 @@ def draw_graph(config, colormappings, params, dataframe): if dataseries == 'age': labels = list(map(config['age_labels'].get, labels)) - if params['view'] == 'stacked': + # hmmm + if params['graph'] == 'stacked' or params['graph'] == 'shared': handles[:] = handles[::-1] labels[:] = labels[::-1] @@ -138,10 +228,10 @@ def draw_graph(config, colormappings, params, dataframe): madlibs = {'dataseries': dataseries, 'dataset': dataset, - 'view': params['view'], + 'graph': params['graph'], 'dataseries_label': config['dataseries_labels'][dataseries], 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][params['view']]} + 'view_label': config['view_labels'][params['graph']]} if 'title' in params: plt.suptitle(Template(params['title']).safe_substitute(madlibs), @@ -172,104 +262,5 @@ def draw_graph(config, colormappings, params, dataframe): print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") -########################################### - - -def main(): - - arguments = docopt(__doc__, version='0.1') - - - config = toml.load("config.toml") - - colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) - - database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) - - cur = database.cursor() - cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") - timestamp = cur.fetchone() - - params = 
config['graph_defaults'].copy() - params['timestamp'] = timestamp - - cur.execute("SELECT dataset FROM checkins GROUP BY dataset") - datasets = [t[0] for t in cur.fetchall()] - cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() - dataserieses = [t[0] for t in cur.description] - dataserieses.remove('week') - dataserieses.remove('dataset') - dataserieses.remove('hits') - - pprint(arguments) - - - dataset=arguments[''] - dataseries=arguments[''] - - if not re.match('^[0-9a-z_]*$', dataset): - print(f"Bad dataset name! '%{dataset}") - exit(1) - if not dataset in datasets: - print(f"Dataset '%{dataset}' not in database.") - exit(1) - if not re.match('^[0-9a-z_]*$', dataseries): - print(f"Bad dataseries name! '%{dataseries}") - exit(1) - if not dataseries in dataserieses: - print(f"Dataseries '%{dataseries}' not in database.") - exit(1) - - - - if arguments['timeseries']: - params['type'] = 'timeseries' - elif arguments['releasebars']: - params['type'] = 'releasebars' - # TODO: waffle! - # read in defaults from config.toml - params.update(config[params['type']]) - - table = params['table'] - - - # maybe docopt isn't the best choice. oh well. 
- for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: - if arguments[graphtype]: - params['graph']=graphtype - break - - - if params['graph'] == 'text': - query = f"""SELECT {dataseries},sum(hits) AS total - FROM {table} - WHERE dataset = '{dataset}' - GROUP BY {dataseries} - ORDER BY total - ASC - """ - cur.execute(query) - # TODO: add title here - for (item,hits) in cur: - print(f"{hits:-10} — {item:40}") - else: - xaxis = params['xaxis'] - - query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits - FROM {table} - WHERE dataset =\"{dataset}\" - GROUP BY {xaxis}, {params['dataseries']} - ORDER BY {xaxis} - """ - df = pd.read_sql_query(query, parse_dates='week', con=database) - - draw_graph(config=config, - colormappings=colormappings, - params=params, - dataframe=df.pivot(index='week', columns=params['dataseries'], - values='hits').astype("Int64"), - ) - - if __name__ == "__main__": main() diff --git a/config.toml b/config.toml index b65d634..6676546 100644 --- a/config.toml +++ b/config.toml @@ -37,8 +37,11 @@ colors = [ image_types = ["png"] [startdate] -fedora = '2021-01-01' # F32 release not fully captured, so start here. -epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 +fedora_updates_systems = '2021-01-01' # F32 release not fully captured, so start here. +fedora_updates_containers = '2021-01-01' +fedora_rawhide_systems = '2021-01-01' +fedora_rawhide_containers = '2021-01-01' +epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 [dataset_labels] epel = "Extra Packages for Enterprise Linux" @@ -68,13 +71,13 @@ age="age category" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" +filebase="$dataset-timeseries-$dataseries-$graph" # TODO: list options! 
[timeseries] subtitle="weekly checkins" xaxis = "week" extraselect="" -filebase="$dataset-timeseries-$dataseries-$view" [releasebars] table="peak" From b60b300d7b1596532c9729bc42153dbcb2d8a017 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 24 2022 23:49:49 +0000 Subject: [PATCH 24/49] hacky way to generate 'em all --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 17bf045..0b8121f 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -4,7 +4,7 @@ Brontosaurus Plotter Usage: brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py releasebars ( text | stacked | share | split ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py releasebar ( text | stacked | share | split ) [--exclude ...|--include ...] [--cutoff ] Options: --exclude ... In the form `dataseries:element`. Can repeat. @@ -57,7 +57,7 @@ def main(): cur = database.cursor() cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") - timestamp = cur.fetchone() + timestamp = cur.fetchone()[0] params = config['graph_defaults'].copy() params['timestamp'] = timestamp @@ -94,15 +94,17 @@ def main(): if arguments['timeseries']: params['type'] = 'timeseries' - elif arguments['releasebars']: - params['type'] = 'releasebars' + elif arguments['releasebar']: + params['type'] = 'releasebar' # TODO: waffle! # read in defaults from config.toml params.update(config[params['type']]) - table = params['table'] + #pprint(params) + + table = params['table'] # maybe docopt isn't the best choice. oh well. 
for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: @@ -142,8 +144,9 @@ def main(): ).astype("Int64") # Smooth over any missing data - #dataframe.resample('W-MON') - pprint(dataframe ) + #if xaxis=='week': + # dataframe.resample('W-MON') + #pprint(dataframe) ################# # Instead of this, accumulate anything more than 10 into "other" @@ -168,7 +171,7 @@ def main(): subplots = False match params['type']: - case 'releasebars': + case 'releasebar': kind = 'bar' case 'timeseries': kind = 'area' @@ -195,7 +198,7 @@ def main(): stacked = False # true everywhere else! colormap = cmap case 'split': - """ This is releasebars-only. """ + """ This is releasebar-only. """ df = dataframe[startdate:] colormap = cmap subplots = True @@ -227,11 +230,16 @@ def main(): bbox_to_anchor=(1.0, 0.5)) madlibs = {'dataseries': dataseries, - 'dataset': dataset, - 'graph': params['graph'], - 'dataseries_label': config['dataseries_labels'][dataseries], - 'dataset_label': config['dataset_labels'][dataset], - 'view_label': config['view_labels'][params['graph']]} + 'dataset': dataset, + 'graph': params['graph'], + 'timestamp': timestamp, + 'type': params['type'], + 'extra': '', + 'dataseries_label': config['dataseries_labels'][dataseries], + 'dataset_label': config['dataset_labels'][dataset], + 'view_label': config['view_labels'][params['graph']], + 'type_label': params['label'], + } if 'title' in params: plt.suptitle(Template(params['title']).safe_substitute(madlibs), diff --git a/config.toml b/config.toml index 6676546..7e31ead 100644 --- a/config.toml +++ b/config.toml @@ -33,7 +33,6 @@ colors = [ ] # could be png, pdf, svg -# TODO: not yet implemented image_types = ["png"] [startdate] @@ -71,16 +70,20 @@ age="age category" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" -filebase="$dataset-timeseries-$dataseries-$graph" -# TODO: list options! +filebase="$timestamp-$dataset-$type-$graph-$dataseries" +# TODO: list possible options! 
[timeseries] -subtitle="weekly checkins" +label="weekly checkins" +subtitle="" xaxis = "week" extraselect="" -[releasebars] +[releasebar] +label="by release" table="peak" xaxis = "release" subtitle="data for each release taken from the week of that release's (current) peak" +[waffleplot] +subtitle="tk!" \ No newline at end of file diff --git a/run.sh b/run.sh index a8d85a0..d150d1e 100755 --- a/run.sh +++ b/run.sh @@ -90,10 +90,21 @@ echo -n "* Painting the feathers..." echo " vibrant!" echo "* Drawing portraits from the fossilized remains... " - LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) - ./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null - if [[ $? != 0 ]]; then - echo "! Oops." - exit 1 - fi -echo " Beautiful." +LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) + #./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null + + #if [[ $? != 0 ]]; then + # echo "! Oops." + # exit 1 + # fi + for dataset in fedora_updates_systems epel; do + for dataseries in age arch release variant; do + for graph in stacked share line; do + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries + done + for graph in stacked share split; do + ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries + done + done + done + echo " Beautiful." From c0b265a80a169c4844860f88bba54fa997e05d22 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 00:11:32 +0000 Subject: [PATCH 25/49] basically works --- diff --git a/TODO.md b/TODO.md index 533cc45..03b7537 100644 --- a/TODO.md +++ b/TODO.md @@ -3,16 +3,25 @@ 1. Save the color mappings to a file as a separate step * using defaults from config (DONE) 2. change brontosaurus-plotter to render _one_ image per call + * first pass done * with a syntax for what to include or exclude by name * and possibly with some number options? - 3. make timeline, releasebar, and waffle be separate commands - 4. 
have some script that pre-renders some defaults + * add back ephemeral, or is that just a subset of the above? + 3. make timeline, releasebar, and waffle be separate commands (DONE) + 4. have some script that pre-renders some defaults (PARTIAL) 5. and a simple front-end for exploring the rest -* put the dataset date in the filename! +* map "generic" and "unknown" and "none" to "unspecified" + -* epel -- need to special-case EL 8 by-release graphs to add peak _after_ - CentOS Linux 8 EOL +* better ordering + +* epel -- need to special-case EL 8 by-release graphs to add peak for + both before and after CentOS Linux 8 EOL + +* fix it so colors don't overlap when there's more than 12 options. + Best bet: lump everything after 11 into "other". + * change the timeseries "hide" to collect small things into "other" * text reports!!! * this week / last week / year-over-year * total systems / total persistent / total ephemeral (+%) @@ -28,13 +37,11 @@ * architecture as above * share of category (desktop/server-cloud-iot/labs) -* use jinjasql for the query templates! +* use jinjasql for the query templates? * sanitize everything coming from config.toml, really. * for the slicer, put the groups in their definitions in the config.toml -* better ordering * secondary timeline charts for variants: * variant variants! @@ -50,20 +57,7 @@ * Report estimating new installs vs upgrades (number of systems older than the release itself ... need to factor in beta release date, etc....) -* I guess we should make it so the timeseries definitions can loop over multiple datasets to avoid - a lot of redundancy. Or at least, to apply to all Fedora datasets? (Yes, that: introduce a "distro" grouping.) - - need a way to actually include multiple datasets at once though, like for the fedora linux + epel graph - -* predefined colors for some things - -* fix it so colors don't overlap when there's more than 12 options. Best bet: lump everything after 11 into "other". 
- -* stacked bar charts for each release with age, arch, variant - - * these stacked bar charts should feature each release at its peak - point, not summed (because that's its most interesting!) - * don't bother with ephemeral/persistent view (age view is enough) +* need a way to include multiple datasets at once, like for the fedora linux + epel graph * sanitize all values read from config.toml @@ -77,10 +71,7 @@ * make animations by week of full [arch,variant,release] * maybe of the breakouts too? - -* change the timeseries "hide" to collect small things into "other" - * the "age" charts are most interesting on a by-week basis, but _per release_. Can we estimate the flow from release-to-release? (Answer: @@ -124,7 +115,6 @@ * fix the code in brotosaurus washer to merge '' to 'none' rather than just renaming (works now because there are no natural 'none' entries). -* map "generic" and "unknown" and "none" to "unspecified" * instead of throwing away entries in the washing phase (especially those below thresholds), write them to a special db for "fun" analysis diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 0b8121f..7621f9d 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -3,8 +3,8 @@ Brontosaurus Plotter Usage: - brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py releasebar ( text | stacked | share | split ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py releasebar ( text | stacked | share ) [--exclude ...|--include ...] [--cutoff ] 
@@ -66,9 +66,10 @@ def main(): datasets = [t[0] for t in cur.fetchall()] cur.execute("SELECT * FROM checkins LIMIT 1").fetchall() dataserieses = [t[0] for t in cur.description] - dataserieses.remove('week') + dataserieses.remove('dataset') dataserieses.remove('hits') + dataserieses.remove('week') #pprint(arguments) @@ -77,16 +78,16 @@ def main(): dataseries=arguments[''] if not re.match('^[0-9a-z_]*$', dataset): - print(f"Bad dataset name! '%{dataset}") + print(f"Bad dataset name! '{dataset}") exit(1) if not dataset in datasets: - print(f"Dataset '%{dataset}' not in database.") + print(f"Dataset '{dataset}' not in database.") exit(1) if not re.match('^[0-9a-z_]*$', dataseries): - print(f"Bad dataseries name! '%{dataseries}") + print(f"Bad dataseries name! '{dataseries}") exit(1) if not dataseries in dataserieses: - print(f"Dataseries '%{dataseries}' not in database.") + print(f"Dataseries '{dataseries}' not in database.") exit(1) startdate = config['startdate'][dataset] @@ -96,6 +97,9 @@ def main(): params['type'] = 'timeseries' elif arguments['releasebar']: params['type'] = 'releasebar' + if dataseries == 'release': + print("Plotting release by release makes no sense.") + exit(1) # TODO: waffle! # read in defaults from config.toml @@ -107,7 +111,7 @@ def main(): table = params['table'] # maybe docopt isn't the best choice. oh well. - for graphtype in [ 'text', 'stacked', 'share', 'split', 'line' ]: + for graphtype in [ 'text', 'stacked', 'share', 'line' ]: if arguments[graphtype]: params['graph']=graphtype break @@ -197,11 +201,11 @@ def main(): kind = 'line' # overrides 'area' stacked = False # true everywhere else! colormap = cmap - case 'split': - """ This is releasebar-only. """ - df = dataframe[startdate:] - colormap = cmap - subplots = True + #case 'split': + # """ This is releasebar-only. 
""" + # df = dataframe[startdate:] + # colormap = cmap + # subplots = True @@ -245,7 +249,7 @@ def main(): plt.suptitle(Template(params['title']).safe_substitute(madlibs), fontsize=24) - # FIX: make work + # FIX: make work if 'subtitle' in params: graph.set_title( Template(params['subtitle']).safe_substitute(madlibs), diff --git a/config.toml b/config.toml index 7e31ead..b2b9086 100644 --- a/config.toml +++ b/config.toml @@ -54,6 +54,7 @@ arch="CPU architecture" release="release " variant="variant" age="age category" +week="per week" [age_labels] '0'='Ephemeral' @@ -66,11 +67,13 @@ age="age category" 'line'="" 'stacked'=" (stacked)" 'share'=" (share)" +'split'="" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" -filebase="$timestamp-$dataset-$type-$graph-$dataseries" +subtitle="$extra" +filebase="$timestamp-$dataset-$type-$dataseries$extra-$graph" # TODO: list possible options! [timeseries] diff --git a/run.sh b/run.sh index d150d1e..54dc26b 100755 --- a/run.sh +++ b/run.sh @@ -87,10 +87,10 @@ echo " Built!" echo -n "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null ./brontosaurus_colorizer.py -echo " vibrant!" +echo " Vibrant!" echo "* Drawing portraits from the fossilized remains... " -LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) + #LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) #./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null #if [[ $? 
!= 0 ]]; then @@ -100,9 +100,15 @@ LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) for dataset in fedora_updates_systems epel; do for dataseries in age arch release variant; do for graph in stacked share line; do + echo ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries + echo -n " " ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries done - for graph in stacked share split; do + done + for dataseries in age arch variant; do + for graph in stacked share; do + echo ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries + echo -n " " ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries done done From cb50f9c5aa0f795e1ffd897604d6dd770e18815f Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 00:41:34 +0000 Subject: [PATCH 26/49] map the unknowns to one place --- diff --git a/TODO.md b/TODO.md index 03b7537..6eb6283 100644 --- a/TODO.md +++ b/TODO.md @@ -11,9 +11,6 @@ 4. have some script that pre-renders some defaults (PARTIAL) 5. and a simple front-end for exploring the rest -* map "generic" and "unknown" and "none" to "unspecified" - - * better ordering * epel -- need to special-case EL 8 by-release graphs to add peak for @@ -112,10 +109,6 @@ estimate that there's probably 50,000 systems out there running Fedora 20 or older). Let's not forget those! -* fix the code in brotosaurus washer to merge '' to 'none' rather than just - renaming (works now because there are no natural 'none' entries). 
- - * instead of throwing away entries in the washing phase (especially those below thresholds), write them to a special db for "fun" analysis diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index b655cb7..f4126aa 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -13,23 +13,32 @@ # The variant "CentOS Stream v21.*" is some sort of horrible scripted thing # apparently, where it keeps getting longer and longer with additional tags # -# Also, for each table, sets any variant that is '' to 'none', because -# '' is hard to work with. (I think this is when people have manually put -# "VARIANT_ID=", as opposed to not having one. I don't think that's useful -# to track separately from 'generic', really, so an alternative would be -# to merge them... but doing this for now.) -# FIXME: this needs to merge them in case someone starts actually sending -# "none" as the string — we'll get a uniqueness constraint violation. + # # This is a regex, in case that's not clear. GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" sqlite3 db/bronto.db << EOF DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; - UPDATE checkins SET variant='none' WHERE variant=''; EOF +# Change generic, unknown, and '' to all be "unspecified" +# (I think this is when people have manually put "VARIANT_ID=", as opposed +# to not having one. I don't think that's useful +# to track separately from 'generic', really, so an alternative would be +# to merge them... but doing this for now.) +# FIXME: this needs to merge them in case someone starts actually sending +# "none" as the string — we'll get a uniqueness constraint violation. 
+sqlite3 db/bronto.db << EOF + BEGIN; + INSERT INTO checkins SELECT week,dataset,release,"unspecified",arch,age,sum(hits) FROM checkins + WHERE variant='generic' or variant='none' or variant='unknown' + GROUP BY week,release,arch,age; + DELETE FROM checkins WHERE variant='generic' or variant='none' or variant='unknown'; + COMMIT; +EOF + # While some test systems ran Fedora Linux 31, the feature landed # in 32 (released 2020-04-27, so drop all the old stuff. FEDORA_STARTVER=32 diff --git a/run.sh b/run.sh index 54dc26b..d92cf15 100755 --- a/run.sh +++ b/run.sh @@ -100,15 +100,11 @@ echo "* Drawing portraits from the fossilized remains... " for dataset in fedora_updates_systems epel; do for dataseries in age arch release variant; do for graph in stacked share line; do - echo ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries - echo -n " " ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries done done for dataseries in age arch variant; do for graph in stacked share; do - echo ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries - echo -n " " ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries done done From 195d221a60583707905b5f93598d3bfe390677b6 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 13:02:26 +0000 Subject: [PATCH 27/49] chonky bars. --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 7621f9d..893027c 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -3,17 +3,17 @@ Brontosaurus Plotter Usage: - brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [--cutoff ] - brontosaurus_plotter.py releasebar ( text | stacked | share ) [--exclude ...|--include ...] [--cutoff ] + brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [options] + brontosaurus_plotter.py releasebar ( text | stacked | share ) [--exclude ...|--include ...] [options] Options: --exclude ... 
In the form `dataseries:element`. Can repeat. --include ... As above, but include _only_ these. --cutoff Drop items where the dataseries has less than n total hits + --output Optional output filename (overrides config!) """ #import matplotlib.dates as dates -import matplotlib.pyplot as plt import sqlite3 from string import Template @@ -26,6 +26,7 @@ import toml import re import matplotlib as m +import matplotlib.pyplot as plt from docopt import docopt @@ -46,6 +47,10 @@ m.rcParams['legend.frameon'] = False def main(): + # TODO: separate initialization (leave in main) from rendering one + # image (or a related set)... put that in a function. Then we can + # draw a bunch of similar files with less overhead. + arguments = docopt(__doc__, version='0.1') @@ -172,7 +177,6 @@ def main(): stacked = True - subplots = False match params['type']: case 'releasebar': @@ -209,18 +213,30 @@ def main(): - # Start the actual graph - graph = df.plot(figsize=config['figsize'], - colormap=colormap, - kind=kind, - stacked=stacked, - subplots=subplots) + # Start the actual graph. + # same for both kinds, except width + match kind: + case 'bar': + graph = df.plot(figsize=config['figsize'], + colormap=colormap, + kind=kind, + stacked=stacked, + width=0.95 + ) + case _: + graph = df.plot(figsize=config['figsize'], + colormap=colormap, + kind=kind, + stacked=stacked, + ) # Labels and titles and stuff. 
ax = plt.gca() handles, labels = ax.get_legend_handles_labels() + + # TODO: generalize this if dataseries == 'age': labels = list(map(config['age_labels'].get, labels)) @@ -260,16 +276,24 @@ def main(): graph.set_ylim([0, None]) graph.spines['right'].set_visible(False) graph.spines['top'].set_visible(False) - sFormatter = m.ticker.ScalarFormatter() - sFormatter.set_scientific(False) - graph.yaxis.set_major_formatter(sFormatter) + graph.yaxis.set_major_formatter(m.ticker.EngFormatter(sep='')) # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) graph.set_xlabel('') + # aesthetic pickiness! + if kind == 'bar': + ax.tick_params(bottom=False) + + + # Not sure why these get rotated by default. Unrotate them! + plt.xticks(rotation = 0) + for ext in config['image_types']: graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") + graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") diff --git a/brontosaurusifier_utils/colormapping.py b/brontosaurusifier_utils/colormapping.py index c338a07..7611c71 100644 --- a/brontosaurusifier_utils/colormapping.py +++ b/brontosaurusifier_utils/colormapping.py @@ -31,7 +31,7 @@ def load_color_cache(cachefile,presetfile): try: cached = toml.load(cachefile) except FileNotFoundError: - print(f"Can't open color cache {cachefile}, so starting fresh.") + #print(f"Can't open color cache {cachefile}, so starting fresh.") cached = {} try: diff --git a/color-presets.toml b/color-presets.toml index 8c0daf7..76f12bc 100644 --- a/color-presets.toml +++ b/color-presets.toml @@ -6,3 +6,15 @@ "AlmaLinux" = "#ffcc0a" "Oracle Linux Server" = "#101010" "CloudLinux" = "#0097f3" + +["fedora_updates_systems.variant"] +"unspecfied" = "#808080" + +["fedora_updates_containers.variant"] +"unspecfied" = "#808080" + 
+["fedora_rawhide_systems.variant"] +"unspecfied" = "#808080" + +["fedora_rawhide_containers.variant"] +"unspecfied" = "#808080" diff --git a/run.sh b/run.sh index d92cf15..d2cb1b6 100755 --- a/run.sh +++ b/run.sh @@ -36,7 +36,7 @@ datafreshness echo -n "* Stomping intermediate files... " rm db/bronto.db 2> /dev/null -echo " extinct." +echo " extinct." echo -n "* Fossilizing ancient images... " mkdir -p images/{svg,png} @@ -52,7 +52,7 @@ echo -n "* Slicing brontosauruses... " echo "! Oops." exit 1 fi -echo " into bits." +echo " into bits." echo -n "* Scrubbing off the dirt... " ./brontosaurus_washer.sh @@ -60,7 +60,7 @@ echo -n "* Scrubbing off the dirt... " echo "! Oops." exit 1 fi -echo " shiny!" +echo " shiny!" echo -n "* Finding the strongest... " ./brontosaurus_fight.sh @@ -68,7 +68,7 @@ echo -n "* Finding the strongest... " echo "! Oops." exit 1 fi -echo " rarrhhhhr!" +echo " rarrhhhhr!" echo -n "* Sorting the eggs... " ./brontosaurus_egg-sorter.py @@ -76,18 +76,18 @@ echo -n "* Sorting the eggs... " echo "! Oops." exit 1 fi -echo " binaried." +echo " binaried." -echo "* Creating cages for different exhibits..." +echo -n "* Creating exhibit cages..." for dataset in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do mkdir -p images/{svg,png}/$dataset done -echo " Built!" +echo " built!" echo -n "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null ./brontosaurus_colorizer.py -echo " Vibrant!" +echo " vibrant!" echo "* Drawing portraits from the fossilized remains... 
" #LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) From 265ff1f59cfc4a7afcd13aa87f72fa11081b0eed Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 13:24:21 +0000 Subject: [PATCH 28/49] oops actually get '' --- diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index f4126aa..36bc013 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -33,9 +33,9 @@ EOF sqlite3 db/bronto.db << EOF BEGIN; INSERT INTO checkins SELECT week,dataset,release,"unspecified",arch,age,sum(hits) FROM checkins - WHERE variant='generic' or variant='none' or variant='unknown' + WHERE variant='generic' or variant='' or variant='none' or variant='unknown' GROUP BY week,release,arch,age; - DELETE FROM checkins WHERE variant='generic' or variant='none' or variant='unknown'; + DELETE FROM checkins WHERE variant='generic' or variant='' or variant='none' or variant='unknown'; COMMIT; EOF From 54a3210dd4829227408f1e163dd283685cb9f93a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 16:13:12 +0000 Subject: [PATCH 29/49] "other" works --- diff --git a/TODO.md b/TODO.md index 6eb6283..c54c9ef 100644 --- a/TODO.md +++ b/TODO.md @@ -16,9 +16,6 @@ * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL -* fix it so colors don't overlap when there's more than 12 options. - Best bet: lump everything after 11 into "other". - * change the timeseries "hide" to collect small things into "other" * text reports!!! 
* this week / last week / year-over-year diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 893027c..6de8639 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -32,8 +32,6 @@ from docopt import docopt from brontosaurusifier_utils import colormapping -DATAFILE = 'db/bronto.db' - m.use("Agg") m.style.use('seaborn-colorblind') @@ -58,7 +56,7 @@ def main(): colormappings = colormapping.load_color_cache(config['color_cache'],config['color_presets']) - database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES) + database = sqlite3.connect(config['datafile'], detect_types=sqlite3.PARSE_DECLTYPES) cur = database.cursor() cur.execute("SELECT DATE(week, '6 days') FROM checkins ORDER BY week DESC LIMIT 1") @@ -147,30 +145,44 @@ def main(): df = pd.read_sql_query(query, parse_dates='week', con=database) - dataframe=df.pivot(index=xaxis, + datatable=df.pivot(index=xaxis, columns=dataseries, values='hits' ).astype("Int64") # Smooth over any missing data #if xaxis=='week': - # dataframe.resample('W-MON') - #pprint(dataframe) + # datatable.resample('W-MON') + #pprint(datatable) ################# - # Instead of this, accumulate anything more than 10 into "other" - # ... and do it _elsewhere_ (easier to do before pivot anyway!) - # + limit number of columns to 10 + other - - hidelist = dataframe.div(dataframe.sum(axis=1), axis=0).max() < 0.2/100 - dataframe.drop(columns=(hidelist[hidelist == True].keys()), inplace=True) + # Find the items below thresholds for percent in any given + # dataseries entry, and also for excess number of items. + # Bin 'em into "other" + # TODO: weight this towards the end of the data, so we don't drop + # emerging interesting things in favor of old news? 
+ toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 + others = toosmall[toosmall == True].keys() + othercol = datatable[others].sum(axis=1).astype("Int64") + datatable.drop(columns=others, inplace=True) + + # still too big? + if len(datatable.columns) > config['maxitems']: + # the -1 in `config['maxitems']-1` is so we don't exceed the + # limit by adding the "others" column! + others = datatable.sum().sort_values(ascending=False)[config['maxitems']-1:].index + othercol += datatable[others].sum(axis=1).astype("Int64") + datatable.drop(columns=others, inplace=True) + + if othercol.any(): + datatable['other'] = othercol ################## # our colors. # the complication here is keeping the same color for the same label # across multiple graphs! cmap = m.colors.ListedColormap(colormapping.get_colors(colormappings, config['colors'], - dataset, dataseries, list(dataframe.columns))) + dataset, dataseries, list(datatable.columns))) ################## # and now.... graph it! @@ -186,28 +198,28 @@ def main(): match params['graph']: case 'stacked': - df = dataframe[startdate:][dataframe.columns[::-1]] + df = datatable[startdate:][datatable.columns[::-1]] colormap = m.colors.ListedColormap(cmap.colors[::-1]) case 'share': if dataseries == 'age': # lower numbers are newer! - df = dataframe[startdate:][dataframe.columns[::-1]].div( - dataframe.sum(axis=1), axis=0)*100 + df = datatable[startdate:][datatable.columns[::-1]].div( + datatable.sum(axis=1), axis=0)*100 colormap = m.colors.ListedColormap(cmap.colors[::-1]) else: # todo: sort arch and variant by popularity, not name! - df = dataframe[startdate:].div( - dataframe.sum(axis=1), axis=0)*100 + df = datatable[startdate:].div( + datatable.sum(axis=1), axis=0)*100 colormap = cmap case 'line': """ This is timeseries-only. """ - df = dataframe[startdate:] + df = datatable[startdate:] kind = 'line' # overrides 'area' stacked = False # true everywhere else! 
colormap = cmap #case 'split': # """ This is releasebar-only. """ - # df = dataframe[startdate:] + # df = datatable[startdate:] # colormap = cmap # subplots = True @@ -292,7 +304,7 @@ def main(): graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") - graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") diff --git a/brontosaurusifier_utils/colormapping.py b/brontosaurusifier_utils/colormapping.py index 7611c71..04502ae 100644 --- a/brontosaurusifier_utils/colormapping.py +++ b/brontosaurusifier_utils/colormapping.py @@ -19,7 +19,10 @@ def get_colors(colormappings, colorlist, dataset, dataseries, items): outcolors = [] for item in items: if str(item) not in colormappings[key]: - colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] + if item == 'other': + colormappings[key][str(item)] = colorlist[-1] + else: + colormappings[key][str(item)] = colorlist[len(colormappings[key]) % len(colorlist)] outcolors.append(colormappings[key][str(item)]) return outcolors diff --git a/color-presets.toml b/color-presets.toml index 76f12bc..6aefa12 100644 --- a/color-presets.toml +++ b/color-presets.toml @@ -1,5 +1,5 @@ ["epel.variant"] -"CentOS Linux" = "#808080" +"CentOS Linux" = "#a0a0a0" "Red Hat Enterprise Linux" = "#ee0000" "CentOS Stream" = "#a14a8c" "Rocky Linux" = "#10b981" @@ -8,13 +8,13 @@ "CloudLinux" = "#0097f3" ["fedora_updates_systems.variant"] -"unspecfied" = "#808080" +"unspecified" = "#cccccc" ["fedora_updates_containers.variant"] -"unspecfied" = "#808080" +"unspecified" = "#cccccc" ["fedora_rawhide_systems.variant"] -"unspecfied" = "#808080" +"unspecified" = "#cccccc" ["fedora_rawhide_containers.variant"] -"unspecfied" = "#808080" 
+"unspecified" = "#cccccc" diff --git a/config.toml b/config.toml index b2b9086..cf12d38 100644 --- a/config.toml +++ b/config.toml @@ -8,12 +8,24 @@ imagepath="images/$filetype/$dataset/" figsize = [16, 9] dpi = 300 +# Entries where the highest value for any +# week (or release) is not above this +# percent of the total for that week (or +# release) will be binned together into +# "other" +minpercent = 0.5 -# Our palette. Note that this also limits the -# number of items per chart. If there are -# more than the number of colors, the last -# color here becomes "other". -# (TODO! Implement that!) + +# Also bin excess entries with "other". +# Note that this limit *does* include the +# "other" line, if any. +maxitems = 10 + +# Our palette. Note that if `maxitems` +# is greater than the number of options in +# this list, it will cycle around! +# Also note: "other" is automatically the +# _last_ color in the list. colors = [ '#51a2da', '#294172', @@ -29,7 +41,6 @@ colors = [ '#aad0ee', '#101010', '#535961', - '#808080', ] # could be png, pdf, svg diff --git a/run.sh b/run.sh index d2cb1b6..e1cb461 100755 --- a/run.sh +++ b/run.sh @@ -36,7 +36,7 @@ datafreshness echo -n "* Stomping intermediate files... " rm db/bronto.db 2> /dev/null -echo " extinct." +echo "extinct." echo -n "* Fossilizing ancient images... " mkdir -p images/{svg,png} @@ -44,7 +44,7 @@ echo -n "* Fossilizing ancient images... " rm images/png/* 2> /dev/null rm images/svg/*/* 2> /dev/null rm images/png/*/* 2> /dev/null -echo " buried." +echo " buried." echo -n "* Slicing brontosauruses... " ./brontosaurus_slicer.sh @@ -52,7 +52,7 @@ echo -n "* Slicing brontosauruses... " echo "! Oops." exit 1 fi -echo " into bits." +echo " into bits." echo -n "* Scrubbing off the dirt... " ./brontosaurus_washer.sh @@ -60,7 +60,7 @@ echo -n "* Scrubbing off the dirt... " echo "! Oops." exit 1 fi -echo " shiny!" +echo " shiny!" echo -n "* Finding the strongest... 
" ./brontosaurus_fight.sh @@ -68,7 +68,7 @@ echo -n "* Finding the strongest... " echo "! Oops." exit 1 fi -echo " rarrhhhhr!" +echo " rarrhhhhr!" echo -n "* Sorting the eggs... " ./brontosaurus_egg-sorter.py @@ -82,7 +82,7 @@ echo -n "* Creating exhibit cages..." for dataset in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do mkdir -p images/{svg,png}/$dataset done -echo " built!" +echo " built!" echo -n "* Painting the feathers..." rm db/color-cache.toml 2> /dev/null From 1cc61ede81f9d3c569e47f787dc266d48de4cdbc Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 16:23:57 +0000 Subject: [PATCH 30/49] wash better --- diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index 36bc013..217dd88 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -1,9 +1,32 @@ #!/bin/bash # -# For every table in bronto.db, delete "known bad" variants. -# +function counthits() { + true + #echo 'SELECT sum(hits) FROM checkins' | sqlite3 db/bronto.db; +} + +counthits + +# While some test systems ran Fedora Linux 31, the feature landed +# in 32 (released 2020-04-27, so drop all the old stuff. +FEDORA_STARTVER=32 +FEDORA_STARTDAY='2021-01-01' +# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) +EPEL_STARTVER=8 +EPEL_STARTDAY='2021-01-01' +sqlite3 db/bronto.db << EOF + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; + DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; +EOF +counthits + + +# For every table in bronto.db, delete "known bad" variants. +# # Please document each new reason for cleaning something here, so we know # why later. 
# @@ -13,7 +36,6 @@ # The variant "CentOS Stream v21.*" is some sort of horrible scripted thing # apparently, where it keeps getting longer and longer with additional tags # - # # This is a regex, in case that's not clear. GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89-703aa1fdcf04|CentOS Stream v21.*)$" @@ -21,6 +43,7 @@ GARBAGE_VARIANTS="^(09975472-cc15-4020-9231-cc6743a15b0f|c1e9af0e-5816-4644-9c89 sqlite3 db/bronto.db << EOF DELETE FROM checkins WHERE variant REGEXP "$GARBAGE_VARIANTS"; EOF +counthits # Change generic, unknown, and '' to all be "unspecified" @@ -38,20 +61,7 @@ sqlite3 db/bronto.db << EOF DELETE FROM checkins WHERE variant='generic' or variant='' or variant='none' or variant='unknown'; COMMIT; EOF - -# While some test systems ran Fedora Linux 31, the feature landed -# in 32 (released 2020-04-27, so drop all the old stuff. -FEDORA_STARTVER=32 -FEDORA_STARTDAY='2021-01-01' -# And same for EPEL, with EL 8 (DNF feature launched in 8.3 at end of 2020) -EPEL_STARTVER=8 -EPEL_STARTDAY='2021-01-01' -sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND release < $FEDORA_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "fedora*" AND week < "$FEDORA_STARTDAY"; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND release < $EPEL_STARTVER; - DELETE FROM checkins WHERE dataset GLOB "epel*" AND week < "$EPEL_STARTDAY"; -EOF +counthits # Clean up entries for name, arch, or release that show up # with less than some threshold in _total_ hits in the @@ -75,4 +85,5 @@ for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bro DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); EOF 
+counthits done From 4db0ac5c67fd8df2462495d9461cadeef4ce005c Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 16:46:00 +0000 Subject: [PATCH 31/49] don't wash away datasets! --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 6de8639..4843a7a 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -93,8 +93,6 @@ def main(): print(f"Dataseries '{dataseries}' not in database.") exit(1) - startdate = config['startdate'][dataset] - if arguments['timeseries']: params['type'] = 'timeseries' @@ -198,28 +196,27 @@ def main(): match params['graph']: case 'stacked': - df = datatable[startdate:][datatable.columns[::-1]] + df = datatable[datatable.columns[::-1]] colormap = m.colors.ListedColormap(cmap.colors[::-1]) case 'share': if dataseries == 'age': # lower numbers are newer! - df = datatable[startdate:][datatable.columns[::-1]].div( + df = datatable[datatable.columns[::-1]].div( datatable.sum(axis=1), axis=0)*100 colormap = m.colors.ListedColormap(cmap.colors[::-1]) else: # todo: sort arch and variant by popularity, not name! - df = datatable[startdate:].div( - datatable.sum(axis=1), axis=0)*100 + df = datatable.div(datatable.sum(axis=1), axis=0)*100 colormap = cmap case 'line': """ This is timeseries-only. """ - df = datatable[startdate:] + df = datatable kind = 'line' # overrides 'area' stacked = False # true everywhere else! colormap = cmap #case 'split': # """ This is releasebar-only. 
""" - # df = datatable[startdate:] + # df = datatable # colormap = cmap # subplots = True diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index 217dd88..1e3b7da 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -57,7 +57,7 @@ sqlite3 db/bronto.db << EOF BEGIN; INSERT INTO checkins SELECT week,dataset,release,"unspecified",arch,age,sum(hits) FROM checkins WHERE variant='generic' or variant='' or variant='none' or variant='unknown' - GROUP BY week,release,arch,age; + GROUP BY week, dataset, release, arch, age; DELETE FROM checkins WHERE variant='generic' or variant='' or variant='none' or variant='unknown'; COMMIT; EOF diff --git a/config.toml b/config.toml index cf12d38..5f5e70c 100644 --- a/config.toml +++ b/config.toml @@ -46,13 +46,6 @@ colors = [ # could be png, pdf, svg image_types = ["png"] -[startdate] -fedora_updates_systems = '2021-01-01' # F32 release not fully captured, so start here. -fedora_updates_containers = '2021-01-01' -fedora_rawhide_systems = '2021-01-01' -fedora_rawhide_containers = '2021-01-01' -epel = '2021-01-01' # DNF feature launched in 8.3 at end of 2020 - [dataset_labels] epel = "Extra Packages for Enterprise Linux" fedora_updates_systems = "Fedora Linux systems" From 883de6061fc53adf4580a61a8cce113801153b66 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 20:22:38 +0000 Subject: [PATCH 32/49] better othering --- diff --git a/brontosaurus_fight.sh b/brontosaurus_fight.sh index 5726335..a322fd4 100755 --- a/brontosaurus_fight.sh +++ b/brontosaurus_fight.sh @@ -3,6 +3,10 @@ # Create a view which only shows the weeks where each release # is at its peak. If someone actually is Good At SQL, I would # not mind help making this more clear. +# +# Also, note the special casing for Fedora Linux 32 (or earlier). +# F32 is useful in the timeline view, but is past its peak +# at the data start date, so we don't want it here. 
sqlite3 db/bronto.db << EOF DROP VIEW IF EXISTS peak; @@ -19,6 +23,7 @@ sqlite3 db/bronto.db << EOF (SELECT week,dataset,release,max(hits) FROM (SELECT week,dataset,release,sum(hits) AS hits FROM checkins + WHERE ( release>'32' OR dataset not like 'fedora%' ) GROUP BY week,dataset,release ORDER BY week) GROUP BY dataset,release) AS peaks diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 4843a7a..b380940 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -9,7 +9,6 @@ Usage: Options: --exclude ... In the form `dataseries:element`. Can repeat. --include ... As above, but include _only_ these. - --cutoff Drop items where the dataseries has less than n total hits --output Optional output filename (overrides config!) """ @@ -106,8 +105,11 @@ def main(): # read in defaults from config.toml params.update(config[params['type']]) - #pprint(params) - + match params['type']: + case 'releasebar': + kind = 'bar' + case 'timeseries': + kind = 'area' table = params['table'] @@ -134,6 +136,7 @@ def main(): xaxis = params['xaxis'] + query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits FROM {table} WHERE dataset =\"{dataset}\" @@ -153,28 +156,37 @@ def main(): # datatable.resample('W-MON') #pprint(datatable) - ################# # Find the items below thresholds for percent in any given - # dataseries entry, and also for excess number of items. - # Bin 'em into "other" + # dataseries entry, and also for excess number of items, + # and bin them together into "other" + # # TODO: weight this towards the end of the data, so we don't drop # emerging interesting things in favor of old news? toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 others = toosmall[toosmall == True].keys() - othercol = datatable[others].sum(axis=1).astype("Int64") + othercol = datatable[others].sum(axis=1).astype("Int64") datatable.drop(columns=others, inplace=True) # still too big? 
if len(datatable.columns) > config['maxitems']: # the -1 in `config['maxitems']-1` is so we don't exceed the # limit by adding the "others" column! - others = datatable.sum().sort_values(ascending=False)[config['maxitems']-1:].index + others = datatable.sum(axis=0).sort_values(ascending=False)[config['maxitems']-1:].index othercol += datatable[others].sum(axis=1).astype("Int64") datatable.drop(columns=others, inplace=True) - if othercol.any(): + # if the remaining "other" ends up big enough to matter, add it to the table + # the division is: highest row (release, say) for the item, compared to the total for that row + if othercol.any() and othercol.max() / datatable.sum(axis=1).max() >= config['minpercent'] / 100: datatable['other'] = othercol + # For bar charts, drop any rows (bars) which are below the threshold + if kind == 'bar': + toosmall=datatable.sum(axis=1)/datatable.sum(axis=1).max() < config['minpercent'] / 100 + datatable.drop(toosmall[toosmall == True].keys(), inplace=True) + + + ################## # our colors. # the complication here is keeping the same color for the same label @@ -188,35 +200,30 @@ def main(): stacked = True - match params['type']: - case 'releasebar': - kind = 'bar' - case 'timeseries': - kind = 'area' + match params['graph']: case 'stacked': - df = datatable[datatable.columns[::-1]] + datatable = datatable[datatable.columns[::-1]] colormap = m.colors.ListedColormap(cmap.colors[::-1]) case 'share': if dataseries == 'age': # lower numbers are newer! - df = datatable[datatable.columns[::-1]].div( + datatable = datatable[datatable.columns[::-1]].div( datatable.sum(axis=1), axis=0)*100 colormap = m.colors.ListedColormap(cmap.colors[::-1]) else: # todo: sort arch and variant by popularity, not name! - df = datatable.div(datatable.sum(axis=1), axis=0)*100 + datatable = datatable.div(datatable.sum(axis=1), axis=0)*100 colormap = cmap case 'line': """ This is timeseries-only. 
""" - df = datatable + #datatable = datatable kind = 'line' # overrides 'area' stacked = False # true everywhere else! colormap = cmap #case 'split': # """ This is releasebar-only. """ - # df = datatable # colormap = cmap # subplots = True @@ -226,14 +233,14 @@ def main(): # same for both kinds, except width match kind: case 'bar': - graph = df.plot(figsize=config['figsize'], + graph = datatable.plot(figsize=config['figsize'], colormap=colormap, kind=kind, stacked=stacked, width=0.95 ) case _: - graph = df.plot(figsize=config['figsize'], + graph = datatable.plot(figsize=config['figsize'], colormap=colormap, kind=kind, stacked=stacked, diff --git a/color-presets.toml b/color-presets.toml index 6aefa12..7962223 100644 --- a/color-presets.toml +++ b/color-presets.toml @@ -8,13 +8,13 @@ "CloudLinux" = "#0097f3" ["fedora_updates_systems.variant"] -"unspecified" = "#cccccc" +"unspecified" = "#a0a0a0" ["fedora_updates_containers.variant"] -"unspecified" = "#cccccc" +"unspecified" = "#a0a0a0" ["fedora_rawhide_systems.variant"] -"unspecified" = "#cccccc" +"unspecified" = "#a0a0a0" ["fedora_rawhide_containers.variant"] -"unspecified" = "#cccccc" +"unspecified" = "#a0a0a0" diff --git a/config.toml b/config.toml index 5f5e70c..601f2e0 100644 --- a/config.toml +++ b/config.toml @@ -46,6 +46,7 @@ colors = [ # could be png, pdf, svg image_types = ["png"] + [dataset_labels] epel = "Extra Packages for Enterprise Linux" fedora_updates_systems = "Fedora Linux systems" From a6af5fa228398efcda70a63d6a031af169c77df8 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 20:34:42 +0000 Subject: [PATCH 33/49] do the other datasets too --- diff --git a/run.sh b/run.sh index e1cb461..0475084 100755 --- a/run.sh +++ b/run.sh @@ -97,7 +97,7 @@ echo "* Drawing portraits from the fossilized remains... " # echo "! Oops." 
# exit 1 # fi - for dataset in fedora_updates_systems epel; do + for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates_containers fedora_rawhide_containers; do for dataseries in age arch release variant; do for graph in stacked share line; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries From 1ae5d956fd1b2cc5dc93c3785d473b44b77b917a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 20:49:24 +0000 Subject: [PATCH 34/49] um, yeah. simplify the colormap logic --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index b380940..87365eb 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -186,48 +186,46 @@ def main(): datatable.drop(toosmall[toosmall == True].keys(), inplace=True) + # If the items in this dataset aren't numeric, + # sort columns by weight + #if not datatable.columns.str.isnumeric().all(): - ################## - # our colors. - # the complication here is keeping the same color for the same label - # across multiple graphs! - cmap = m.colors.ListedColormap(colormapping.get_colors(colormappings, config['colors'], - dataset, dataseries, list(datatable.columns))) - ################## - # and now.... graph it! - stacked = True - + + + + + stacked = True match params['graph']: case 'stacked': datatable = datatable[datatable.columns[::-1]] - colormap = m.colors.ListedColormap(cmap.colors[::-1]) case 'share': if dataseries == 'age': # lower numbers are newer! datatable = datatable[datatable.columns[::-1]].div( datatable.sum(axis=1), axis=0)*100 - colormap = m.colors.ListedColormap(cmap.colors[::-1]) else: # todo: sort arch and variant by popularity, not name! datatable = datatable.div(datatable.sum(axis=1), axis=0)*100 - colormap = cmap case 'line': """ This is timeseries-only. """ #datatable = datatable kind = 'line' # overrides 'area' stacked = False # true everywhere else! - colormap = cmap #case 'split': # """ This is releasebar-only. 
""" # colormap = cmap # subplots = True - - + + # our colors. + # the complication here is keeping the same color for the same label + # across multiple graphs! + colormap = m.colors.ListedColormap(colormapping.get_colors(colormappings, config['colors'], + dataset, dataseries, list(datatable.columns))) # Start the actual graph. # same for both kinds, except width @@ -258,7 +256,7 @@ def main(): labels = list(map(config['age_labels'].get, labels)) # hmmm - if params['graph'] == 'stacked' or params['graph'] == 'shared': + if params['graph'] == 'stacked' or params['graph'] == 'share': handles[:] = handles[::-1] labels[:] = labels[::-1] From e9196b6faf964e3bc3cc08ffb4cc48dc0a4a4f62 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 25 2022 21:35:30 +0000 Subject: [PATCH 35/49] better ordering -- good enough for now! --- diff --git a/TODO.md b/TODO.md index c54c9ef..8d01bbc 100644 --- a/TODO.md +++ b/TODO.md @@ -11,7 +11,6 @@ 4. have some script that pre-renders some defaults (PARTIAL) 5. and a simple front-end for exploring the rest -* better ordering * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 87365eb..f57f811 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -188,37 +188,33 @@ def main(): # If the items in this dataset aren't numeric, # sort columns by weight - #if not datatable.columns.str.isnumeric().all(): - - - - - - - - + if datatable.columns.dtype != 'int64': + if not datatable.columns.str.isnumeric().all(): + datatable = datatable.reindex(datatable.sum(axis=0).sort_values(ascending=True).index,axis=1) + if dataseries == 'release': + # treat more recent releases as "lower" conceptually + datatable = datatable[datatable.columns[::-1]] stacked = True + yformatter = m.ticker.EngFormatter(sep='') match params['graph']: case 'stacked': + # invert! 
datatable = datatable[datatable.columns[::-1]] case 'share': - if dataseries == 'age': - # lower numbers are newer! - datatable = datatable[datatable.columns[::-1]].div( - datatable.sum(axis=1), axis=0)*100 - else: - # todo: sort arch and variant by popularity, not name! - datatable = datatable.div(datatable.sum(axis=1), axis=0)*100 + # also invert + datatable = datatable[datatable.columns[::-1]] + # and convert to percent + datatable = datatable.div(datatable.sum(axis=1), axis=0)*100 + yformatter = m.ticker.PercentFormatter() case 'line': """ This is timeseries-only. """ - #datatable = datatable kind = 'line' # overrides 'area' stacked = False # true everywhere else! #case 'split': # """ This is releasebar-only. """ - # colormap = cmap + # subplots = True # our colors. @@ -248,17 +244,19 @@ def main(): ax = plt.gca() handles, labels = ax.get_legend_handles_labels() - + # default direction seems backwards to me! + handles[:] = handles[::-1] + labels[:] = labels[::-1] # TODO: generalize this if dataseries == 'age': labels = list(map(config['age_labels'].get, labels)) - - # hmmm - if params['graph'] == 'stacked' or params['graph'] == 'share': - handles[:] = handles[::-1] - labels[:] = labels[::-1] + if kind == 'line': + # put it back the other way for this case! + handles[:] = handles[::-1] + labels[:] = labels[::-1] + plt.legend(handles, labels, loc='center left', bbox_to_anchor=(1.0, 0.5)) @@ -290,7 +288,7 @@ def main(): graph.set_ylim([0, None]) graph.spines['right'].set_visible(False) graph.spines['top'].set_visible(False) - graph.yaxis.set_major_formatter(m.ticker.EngFormatter(sep='')) + graph.yaxis.set_major_formatter(yformatter) # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) graph.set_xlabel('') From 97eb8055ed407873000461417691bcba1f6bbf85 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 00:23:01 +0000 Subject: [PATCH 36/49] ehhh this is not good. but getting there! 
--- diff --git a/TODO.md b/TODO.md index 8d01bbc..e39af67 100644 --- a/TODO.md +++ b/TODO.md @@ -15,6 +15,7 @@ * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL +* add "remove bad characters!" from cleanup script * text reports!!! * this week / last week / year-over-year diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index f57f811..899d23f 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -3,16 +3,13 @@ Brontosaurus Plotter Usage: - brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [--exclude ...|--include ...] [options] - brontosaurus_plotter.py releasebar ( text | stacked | share ) [--exclude ...|--include ...] [options] + brontosaurus_plotter.py timeseries ( text | stacked | share | line ) [ ... ] [options] + brontosaurus_plotter.py releasebar ( text | stacked | share ) [ ... ] [options] Options: - --exclude ... In the form `dataseries:element`. Can repeat. - --include ... As above, but include _only_ these. - --output Optional output filename (overrides config!) + --output Optional output filename (overrides config!) 
""" -#import matplotlib.dates as dates import sqlite3 from string import Template @@ -29,6 +26,8 @@ import matplotlib.pyplot as plt from docopt import docopt +from collections import defaultdict + from brontosaurusifier_utils import colormapping m.use("Agg") @@ -73,8 +72,6 @@ def main(): dataserieses.remove('hits') dataserieses.remove('week') - #pprint(arguments) - dataset=arguments[''] dataseries=arguments[''] @@ -90,9 +87,47 @@ def main(): exit(1) if not dataseries in dataserieses: print(f"Dataseries '{dataseries}' not in database.") - exit(1) + exit(1) + filterincludes = defaultdict(set) + filter = "" + + if arguments['']: + for f in arguments['']: + if "'" in f: + # TODO: actual, good validation + print("No please.") + exit(2) + try: + (filterseries,filteritem) = f.split('+',1) + filterincludes[filterseries].add(filteritem) + except ValueError: + try: + (filterseries,filteritem) = f.split('-',1) + filter+=f" AND {filterseries} != '{filteritem}'" + except ValueError: + print(f"Filter `{f}` is not valid. Must be `dataseries+item` or `dataseries-item`") + exit(1) + if not filterseries in dataserieses: + print(f"Filter `{f}` doesn't match a dataseries. 
(Try `variant`, `release`, `arch`, or `age`.)") + exit(1) + + # TODO: stop this nonsense, use a proper ORM + + for (incseries,incitems) in filterincludes.items(): + # TODO parens only if needed + filter += " AND (" + inclist=set() + for incitem in incitems: + inclist.add(f"{incseries} = {incitem}") + filter += " OR ".join(inclist) + filter += " ) " + if filter: + filterstring = ":" + ":".join(sorted(arguments[''])).replace(" ","_") + else: + filterstring = "" + if arguments['timeseries']: params['type'] = 'timeseries' elif arguments['releasebar']: @@ -120,14 +155,16 @@ def main(): break - if params['graph'] == 'text': + if params['graph'] == 'text': query = f"""SELECT {dataseries},sum(hits) AS total FROM {table} WHERE dataset = '{dataset}' + {filter} GROUP BY {dataseries} ORDER BY total ASC """ + cur.execute(query) # TODO: add title here for (item,hits) in cur: @@ -140,9 +177,11 @@ def main(): query = f"""SELECT {xaxis}, {dataseries}, SUM(hits) as hits FROM {table} WHERE dataset =\"{dataset}\" + {filter} GROUP BY {xaxis}, {dataseries} ORDER BY {xaxis} """ + print(query) df = pd.read_sql_query(query, parse_dates='week', con=database) @@ -271,6 +310,7 @@ def main(): 'dataset_label': config['dataset_labels'][dataset], 'view_label': config['view_labels'][params['graph']], 'type_label': params['label'], + 'filter' : filterstring, } if 'title' in params: @@ -304,7 +344,7 @@ def main(): graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") - #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") diff --git a/config.toml b/config.toml index 601f2e0..95a10e5 100644 --- a/config.toml +++ b/config.toml @@ -6,7 +6,7 @@ color_cache = "db/color-cache.toml" 
imagepath="images/$filetype/$dataset/" figsize = [16, 9] -dpi = 300 +dpi = 150 # Entries where the highest value for any # week (or release) is not above this @@ -77,13 +77,12 @@ week="per week" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" -subtitle="$extra" -filebase="$timestamp-$dataset-$type-$dataseries$extra-$graph" +subtitle="$filter" +filebase="$timestamp-$dataset-$type-$graph-$dataseries$filter" # TODO: list possible options! [timeseries] label="weekly checkins" -subtitle="" xaxis = "week" extraselect="" @@ -91,7 +90,7 @@ extraselect="" label="by release" table="peak" xaxis = "release" -subtitle="data for each release taken from the week of that release's (current) peak" +subtitle="$filter (at release peak)" [waffleplot] subtitle="tk!" \ No newline at end of file From eeff2528d7c680b2c4d0a3174bb9683d08db8f19 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 14:55:27 +0000 Subject: [PATCH 37/49] there that's better -- minpercent now works --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 899d23f..9ea613e 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -181,7 +181,6 @@ def main(): GROUP BY {xaxis}, {dataseries} ORDER BY {xaxis} """ - print(query) df = pd.read_sql_query(query, parse_dates='week', con=database) @@ -199,9 +198,24 @@ def main(): # dataseries entry, and also for excess number of items, # and bin them together into "other" # + # We consider three things: + # + # * percent of at least one row (week or release + # depending on chart type) must exceed minpercent + # + # * percent of total must also. + # + # * but wait, if the percent of any of the last four + # rows is above the threshold, keep that after all + # # TODO: weight this towards the end of the data, so we don't drop # emerging interesting things in favor of old news? 
- toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 + # + # old way: toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 + # + toosmall = datatable[datatable==datatable.max()].div(datatable.sum(axis=1),axis=0).max() < config['minpercent'] / 100 + toosmall |= datatable.sum()/datatable.sum().sum() < config['minpercent'] / 100 + toosmall &= datatable[-4:].div(datatable.sum(axis=1),axis=0).fillna(0).max() < config['minpercent'] / 100 others = toosmall[toosmall == True].keys() othercol = datatable[others].sum(axis=1).astype("Int64") datatable.drop(columns=others, inplace=True) diff --git a/config.toml b/config.toml index 95a10e5..6db9d5f 100644 --- a/config.toml +++ b/config.toml @@ -21,6 +21,7 @@ minpercent = 0.5 # "other" line, if any. maxitems = 10 + # Our palette. Note that if `maxitems` # is greater than the number of options in # this list, it will cycle around! From d7e22c27cf57e7f0990077bf59d858cfcdd89a9a Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 15:26:57 +0000 Subject: [PATCH 38/49] optimization notes --- diff --git a/TODO.md b/TODO.md index e39af67..fae582d 100644 --- a/TODO.md +++ b/TODO.md @@ -31,6 +31,21 @@ * architecture as above * share of category (desktop/server-cloud-iot/labs) +* profiling notes for the plotter: + * A full 75% that can't really be optimized + * 48% pandas graph call + * 27% savefig call + * But this could probably be better: + * 17.2% the main pd.read_sql_query + * 0.6% datatable pivot + * And these things are redundant: + * 0.3% docopt + * 0.9% loading config toml + * 2.2% loading the color cache + * 1.2% reading the names of the datasets :( + * 1.7% the "other" filtering (not huge but probably an easy fix) + * (That all accounts for 99.1% of time, so... not much else to improve!) + * use jinjasql for the query templates? * sanitize everything coming from config.toml, really. 
diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 9ea613e..71abcb1 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -38,9 +38,6 @@ m.rcParams['font.family'] = 'Montserrat' m.rcParams['legend.frameon'] = False - - - def main(): # TODO: separate initialization (leave in main) from rendering one @@ -358,7 +355,7 @@ def main(): graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") - graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") From adb4a888a98a6927a30e43eed8e050207d8f3d10 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 15:48:49 +0000 Subject: [PATCH 39/49] allow "age+ephemeral" and "age+persistent" --- diff --git a/TODO.md b/TODO.md index fae582d..a05f228 100644 --- a/TODO.md +++ b/TODO.md @@ -3,9 +3,8 @@ 1. Save the color mappings to a file as a separate step * using defaults from config (DONE) 2. change brontosaurus-plotter to render _one_ image per call - * first pass done - * with a syntax for what to include or exclude by name - * and possibly with some number options? + * first pass (DONE) + * with a syntax for what to include or exclude by name (DONE) * add back ephemeral, or is that just a subset of the above? 3. make timeline, releasebar, and waffle be separate commands (DONE) 4. have some script that pre-renders some defaults (PARTIAL) @@ -15,7 +14,11 @@ * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL -* add "remove bad characters!" from cleanup script +* add "remove bad characters!" to cleanup script + +* add numeric options (less than, greater than) to plotter filter + +* validate that filter items exist in the data? * text reports!!! 
* this week / last week / year-over-year diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 71abcb1..d4731c9 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -86,14 +86,31 @@ def main(): print(f"Dataseries '{dataseries}' not in database.") exit(1) + + if arguments['timeseries']: + params['type'] = 'timeseries' + elif arguments['releasebar']: + params['type'] = 'releasebar' + if dataseries == 'release': + print("Plotting release by release makes no sense.") + exit(1) + elif arguments['waffleplot']: + print("Waffle plots not yet implemented.") + exit(1) + + + # this parses the command line for the filter parameters. + # It then constructs SQL from those. + # TODO: stop this nonsense, use a proper ORM + filterincludes = defaultdict(set) filter = "" if arguments['']: for f in arguments['']: - if "'" in f: + if not re.match('^[0-9A-Za-z_ \+\-]*$', f): # TODO: actual, good validation - print("No please.") + print(f"Invalid characters in `{f}`. If this is legit, file a bug please.") exit(2) try: (filterseries,filteritem) = f.split('+',1) @@ -109,14 +126,21 @@ def main(): print(f"Filter `{f}` doesn't match a dataseries. (Try `variant`, `release`, `arch`, or `age`.)") exit(1) - # TODO: stop this nonsense, use a proper ORM for (incseries,incitems) in filterincludes.items(): # TODO parens only if needed filter += " AND (" inclist=set() for incitem in incitems: - inclist.add(f"{incseries} = {incitem}") + # special case age! 
+ if incseries=='age': + if incitem=='persistent': + inclist.add(f"age > 0") + continue + elif incitem=='ephemeral': + inclist.add(f"age = 0") + continue + inclist.add(f"{incseries} = '{incitem}'") filter += " OR ".join(inclist) filter += " ) " @@ -125,14 +149,7 @@ def main(): else: filterstring = "" - if arguments['timeseries']: - params['type'] = 'timeseries' - elif arguments['releasebar']: - params['type'] = 'releasebar' - if dataseries == 'release': - print("Plotting release by release makes no sense.") - exit(1) - # TODO: waffle! + # read in defaults from config.toml params.update(config[params['type']]) From 531ccdbbacb69b865984c9912171be47d22b96ae Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 19:59:12 +0000 Subject: [PATCH 40/49] wash better --- diff --git a/TODO.md b/TODO.md index a05f228..0bcaea5 100644 --- a/TODO.md +++ b/TODO.md @@ -18,8 +18,15 @@ * add numeric options (less than, greater than) to plotter filter +* clean up the horrible filter hack code + * cosmetic: group includes and excludes of the same series + so they can be displayed pretty + * validate that the excludes and includes don't overlap * validate that filter items exist in the data? +* cleanup: get my terms straight for + data set, series, axis, column, row, point, item + * text reports!!! * this week / last week / year-over-year * total systems / total persistent / total ephemeral (+%) diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index d4731c9..4a1fb71 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -11,6 +11,7 @@ Options: """ +import os import sqlite3 from string import Template @@ -102,25 +103,30 @@ def main(): # this parses the command line for the filter parameters. # It then constructs SQL from those. 
# TODO: stop this nonsense, use a proper ORM + # TODO: or really literally anything else filterincludes = defaultdict(set) filter = "" + includelist = set() + excludelist = set() if arguments['']: for f in arguments['']: - if not re.match('^[0-9A-Za-z_ \+\-]*$', f): + if not re.match('^[0-9A-Za-z_ =\!\-]*$', f): # TODO: actual, good validation print(f"Invalid characters in `{f}`. If this is legit, file a bug please.") exit(2) try: - (filterseries,filteritem) = f.split('+',1) + (filterseries,filteritem) = f.split('=',1) filterincludes[filterseries].add(filteritem) + includelist.add(f"{filterseries} = {filteritem}") except ValueError: try: - (filterseries,filteritem) = f.split('-',1) + (filterseries,filteritem) = f.split('!',1) filter+=f" AND {filterseries} != '{filteritem}'" + excludelist.add(f"{filterseries} = {filteritem}") except ValueError: - print(f"Filter `{f}` is not valid. Must be `dataseries+item` or `dataseries-item`") + print(f"Filter `{f}` is not valid. Must be `dataseries:item` or `dataseries-item`") exit(1) if not filterseries in dataserieses: print(f"Filter `{f}` doesn't match a dataseries. (Try `variant`, `release`, `arch`, or `age`.)") @@ -145,11 +151,20 @@ def main(): filter += " ) " if filter: - filterstring = ":" + ":".join(sorted(arguments[''])).replace(" ","_") + filterstring = "_" + "_".join(sorted(arguments[''])).replace(" ","_").lower() else: filterstring = "" - + # haccckkkky! 
+ + if includelist: + includetext = "Included: " + ', '.join(includelist) + else: + includetext = "" + if excludelist: + excludetext = "\nExcluded: " + ', '.join(excludelist) + else: + excludetext = "" # read in defaults from config.toml params.update(config[params['type']]) @@ -197,6 +212,9 @@ def main(): """ df = pd.read_sql_query(query, parse_dates='week', con=database) + if len(df) == 0: + print(f"No data for\n${query}") + exit(0) datatable=df.pivot(index=xaxis, columns=dataseries, @@ -339,6 +357,8 @@ def main(): 'view_label': config['view_labels'][params['graph']], 'type_label': params['label'], 'filter' : filterstring, + 'excludes' : excludetext, + 'includes' : includetext, } if 'title' in params: @@ -368,8 +388,13 @@ def main(): # Not sure why these get rotated by default. Unrotate them! plt.xticks(rotation = 0) + for ext in config['image_types']: - graph.figure.savefig(f"images/{ext}/{dataset}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", + imagepath = Template(config['imagepath']).safe_substitute(madlibs) + imagepath = Template(imagepath).safe_substitute({ 'filetype': ext }) + if filter and not ( filterstring == "_age=ephemeral" or filterstring != "_age=persistant" ): + imagepath += "/filtered" + graph.figure.savefig(f"{imagepath}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") diff --git a/brontosaurus_washer.sh b/brontosaurus_washer.sh index 1e3b7da..e8758ae 100755 --- a/brontosaurus_washer.sh +++ b/brontosaurus_washer.sh @@ -1,10 +1,14 @@ #!/bin/bash # +if [ "$1" == "-v" ]; then + VERBOSE=1 +fi function counthits() { - true - #echo 'SELECT sum(hits) FROM checkins' | sqlite3 db/bronto.db; + if [ "$VERBOSE" = 1 ]; then + echo 'SELECT sum(hits) FROM checkins' | sqlite3 db/bronto.db; + fi } counthits @@ -73,17 +77,25 @@ counthits # Note that since we regenerate the whole db from totals.db # each week, if 
something exceeds this threshold later, it will # suddenly appear -THRESHOLD_TOTAL=100 -THRESHOLD_WEEKLY=3 +THRESHOLD_TOTAL=$( echo 'SELECT MAX(total)/100000 FROM ( SELECT variant,SUM(hits) AS total FROM CHECKINS GROUP BY variant );' | sqlite3 ./db/bronto.db ) +THRESHOLD_WEEKLY=$( echo 'SELECT MAX(total)/100000 FROM ( SELECT variant,MAX(hits) AS total FROM CHECKINS GROUP BY variant );' | sqlite3 ./db/bronto.db ) + for GROUP in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do - sqlite3 db/bronto.db << EOF - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_WEEKLY); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); - DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_WEEKLY); + # hello, hack! 
+ if [[ "$GROUP" == "epel" ]]; then + THRESHOLD_TOTAL=$(( THRESHOLD_TOTAL * 2 )) + THRESHOLD_WEEKLY=$(( THRESHOLD_WEEKLY * 2 )) + fi + sqlite3 db/bronto.db << EOF + BEGIN; + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND variant IN (SELECT variant FROM checkins WHERE dataset = "$GROUP" GROUP BY variant HAVING MAX(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND arch IN (SELECT arch FROM checkins WHERE dataset = "$GROUP" GROUP BY arch HAVING MAX(hits) < $THRESHOLD_WEEKLY); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING SUM(hits) < $THRESHOLD_TOTAL); + DELETE FROM checkins WHERE dataset = "$GROUP" AND release IN (SELECT release FROM checkins WHERE dataset = "$GROUP" GROUP BY release HAVING MAX(hits) < $THRESHOLD_WEEKLY); + COMMIT; EOF counthits done diff --git a/config.toml b/config.toml index 6db9d5f..8a3bfee 100644 --- a/config.toml +++ b/config.toml @@ -3,20 +3,23 @@ datafile = "db/bronto.db" color_presets = "color-presets.toml" color_cache = "db/color-cache.toml" -imagepath="images/$filetype/$dataset/" +imagepath = "images/$filetype/$dataset" figsize = [16, 9] dpi = 150 -# Entries where the highest value for any +# Columns where the highest value for any # week (or release) is not above this # percent of the total for that week (or # release) will be binned together into -# "other" +# "other". This also applies (separately!) +# to columns where the total _cumulatively_ +# does not exceed this percent of the +# total of all columns. minpercent = 0.5 + - -# Also bin excess entries with "other". 
+# Also bin excess columns with "other". # Note that this limit *does* include the # "other" line, if any. maxitems = 10 @@ -78,7 +81,7 @@ week="per week" [graph_defaults] table="checkins" title="$dataset_label: $dataseries_label $type_label" -subtitle="$filter" +subtitle="$includes $excludes" filebase="$timestamp-$dataset-$type-$graph-$dataseries$filter" # TODO: list possible options! @@ -91,7 +94,7 @@ extraselect="" label="by release" table="peak" xaxis = "release" -subtitle="$filter (at release peak)" +subtitle="$includes $excludes (at release peak)" [waffleplot] subtitle="tk!" \ No newline at end of file diff --git a/run.sh b/run.sh index 0475084..fcebfbd 100755 --- a/run.sh +++ b/run.sh @@ -44,6 +44,8 @@ echo -n "* Fossilizing ancient images... " rm images/png/* 2> /dev/null rm images/svg/*/* 2> /dev/null rm images/png/*/* 2> /dev/null + rm images/svg/*/*/* 2> /dev/null + rm images/png/*/*/* 2> /dev/null echo " buried." echo -n "* Slicing brontosauruses... " @@ -80,7 +82,7 @@ echo " binaried." echo -n "* Creating exhibit cages..." for dataset in $(echo 'SELECT DISTINCT(dataset) FROM checkins;' | sqlite3 ./db/bronto.db); do - mkdir -p images/{svg,png}/$dataset + mkdir -p images/{svg,png}/$dataset/filtered done echo " built!" @@ -92,21 +94,5 @@ echo " vibrant!" echo "* Drawing portraits from the fossilized remains... " #LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) #./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null - - #if [[ $? != 0 ]]; then - # echo "! Oops." 
- # exit 1 - # fi - for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates_containers fedora_rawhide_containers; do - for dataseries in age arch release variant; do - for graph in stacked share line; do - ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries - done - done - for dataseries in age arch variant; do - for graph in stacked share; do - ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries - done - done - done +./bronosaurus_plotall.sh echo " Beautiful." From c3aba618ab2721a5814146bc31d8de0cbe5103d6 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 19:59:28 +0000 Subject: [PATCH 41/49] this is temporary --- diff --git a/brontosaurus_plotall.sh b/brontosaurus_plotall.sh new file mode 100755 index 0000000..4619aa7 --- /dev/null +++ b/brontosaurus_plotall.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates_containers fedora_rawhide_containers; do + for dataseries in age arch release variant; do + for graph in stacked share line; do + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries + if [ "$dataseries" != "age"]; fi + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=persistent + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=ephemeral + fi + for sub in release variant; do + INC=$(echo "select $sub from checkins where dataset='$dataset' group by $sub;"|sqlite3 ./db/bronto.db ) + for inc in $INC; do + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc + if [ "$dataseries" != "age"]; fi + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral + fi + done + done + done + done + for dataseries in age arch variant; do + for graph in stacked share; do + ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries + if [ 
"$dataseries" != "age"]; fi + ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=persistent + ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=emphemeral + fi + for sub in age variant; do + INC=$(echo "select $sub from checkins where dataset='$dataset' group by $sub;"|sqlite3 ./db/bronto.db ) + for inc in $INC; do + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc + if [ "$dataseries" != "age" ] && [ "$sub" != "age" ]; fi + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent + ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral + fi + done + done + done + done + done From 270b55d476159ea2359fef940361d7c0dcdbe99b Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 19:59:58 +0000 Subject: [PATCH 42/49] speling --- diff --git a/run.sh b/run.sh index fcebfbd..3f8c885 100755 --- a/run.sh +++ b/run.sh @@ -94,5 +94,5 @@ echo " vibrant!" echo "* Drawing portraits from the fossilized remains... " #LINES=$(toml -dj < config.toml |jq '.timeseries[].views.value[].value'|wc -l) #./brontosaurus_plotter.py | pv -F " %p %e" -w60 -l -s $LINES > /dev/null -./bronosaurus_plotall.sh +./brontosaurus_plotall.sh echo " Beautiful." 
From fafce2140a2a33b54e683a17ae26410892e9821e Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 20:01:00 +0000 Subject: [PATCH 43/49] sshhhh you didn't see that --- diff --git a/brontosaurus_plotall.sh b/brontosaurus_plotall.sh index 4619aa7..6b53425 100755 --- a/brontosaurus_plotall.sh +++ b/brontosaurus_plotall.sh @@ -4,7 +4,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates for dataseries in age arch release variant; do for graph in stacked share line; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries - if [ "$dataseries" != "age"]; fi + if [ "$dataseries" != "age"]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=ephemeral fi @@ -12,7 +12,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates INC=$(echo "select $sub from checkins where dataset='$dataset' group by $sub;"|sqlite3 ./db/bronto.db ) for inc in $INC; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc - if [ "$dataseries" != "age"]; fi + if [ "$dataseries" != "age"]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral fi @@ -23,7 +23,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates for dataseries in age arch variant; do for graph in stacked share; do ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries - if [ "$dataseries" != "age"]; fi + if [ "$dataseries" != "age"]; then ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=persistent ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=emphemeral fi @@ -31,7 +31,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates INC=$(echo "select $sub from checkins where dataset='$dataset' group 
by $sub;"|sqlite3 ./db/bronto.db ) for inc in $INC; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc - if [ "$dataseries" != "age" ] && [ "$sub" != "age" ]; fi + if [ "$dataseries" != "age" ] && [ "$sub" != "age" ]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral fi From 363cbdb73476fd8aff4bd9c72dc85b8ecefb5828 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 20:01:23 +0000 Subject: [PATCH 44/49] everhything is terrible --- diff --git a/brontosaurus_plotall.sh b/brontosaurus_plotall.sh index 6b53425..37ee2af 100755 --- a/brontosaurus_plotall.sh +++ b/brontosaurus_plotall.sh @@ -4,7 +4,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates for dataseries in age arch release variant; do for graph in stacked share line; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries - if [ "$dataseries" != "age"]; then + if [ "$dataseries" != "age" ]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries age=ephemeral fi @@ -12,7 +12,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates INC=$(echo "select $sub from checkins where dataset='$dataset' group by $sub;"|sqlite3 ./db/bronto.db ) for inc in $INC; do ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc - if [ "$dataseries" != "age"]; then + if [ "$dataseries" != "age" ]; then ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=persistent ./brontosaurus_plotter.py timeseries $graph $dataset $dataseries $sub=$inc age=ephemeral fi @@ -23,7 +23,7 @@ for dataset in fedora_updates_systems epel fedora_rawhide_systems fedora_updates for dataseries in age arch variant; do for graph in stacked share; do ./brontosaurus_plotter.py 
releasebar $graph $dataset $dataseries - if [ "$dataseries" != "age"]; then + if [ "$dataseries" != "age" ]; then ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=persistent ./brontosaurus_plotter.py releasebar $graph $dataset $dataseries age=emphemeral fi From 45a7469c65ad3d47a5c65ab465485cf10856d0e1 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 21:09:13 +0000 Subject: [PATCH 45/49] ok, definitely done for today :) --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 4a1fb71..230fdda 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -392,7 +392,7 @@ def main(): for ext in config['image_types']: imagepath = Template(config['imagepath']).safe_substitute(madlibs) imagepath = Template(imagepath).safe_substitute({ 'filetype': ext }) - if filter and not ( filterstring == "_age=ephemeral" or filterstring != "_age=persistant" ): + if filter and not ( filterstring == "_age=ephemeral" or filterstring == "_age=persistant" ): imagepath += "/filtered" graph.figure.savefig(f"{imagepath}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") From bceef7b54df2e2fc1659a6682c6e88dbc2ba07f0 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 21:28:43 +0000 Subject: [PATCH 46/49] don't do "other" for age --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index 230fdda..fbdc262 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -226,50 +226,52 @@ def main(): # datatable.resample('W-MON') #pprint(datatable) - # Find the items below thresholds for percent in any given - # dataseries entry, and also for excess number of items, - # and bin them together into "other" - # - # We consider three things: - # - # * percent of at least one row (week or release - # depending on chart type) must exceed minpercent - # - # * percent of total must also. 
- # - # * but wait, if the percent of any of the last four - # rows is above the threshold, keep that after all - # - # TODO: weight this towards the end of the data, so we don't drop - # emerging interesting things in favor of old news? - # - # old way: toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 - # - toosmall = datatable[datatable==datatable.max()].div(datatable.sum(axis=1),axis=0).max() < config['minpercent'] / 100 - toosmall |= datatable.sum()/datatable.sum().sum() < config['minpercent'] / 100 - toosmall &= datatable[-4:].div(datatable.sum(axis=1),axis=0).fillna(0).max() < config['minpercent'] / 100 - others = toosmall[toosmall == True].keys() - othercol = datatable[others].sum(axis=1).astype("Int64") - datatable.drop(columns=others, inplace=True) - - # still too big? - if len(datatable.columns) > config['maxitems']: - # the -1 in `config['maxitems']-1` is so we don't exceed the - # limit by adding the "others" column! - others = datatable.sum(axis=0).sort_values(ascending=False)[config['maxitems']-1:].index - othercol += datatable[others].sum(axis=1).astype("Int64") - datatable.drop(columns=others, inplace=True) - - # if the remaining "other" ends up big enough to matter, add it to the table - # the division is: highest row (release, say) for the item, compared to the total for that row - if othercol.any() and othercol.max() / datatable.sum(axis=1).max() >= config['minpercent'] / 100: - datatable['other'] = othercol - # For bar charts, drop any rows (bars) which are below the threshold - if kind == 'bar': - toosmall=datatable.sum(axis=1)/datatable.sum(axis=1).max() < config['minpercent'] / 100 - datatable.drop(toosmall[toosmall == True].keys(), inplace=True) + if dataseries != 'age': + # Find the items below thresholds for percent in any given + # dataseries entry, and also for excess number of items, + # and bin them together into "other" + # + # We consider three things: + # + # * percent of at least one row 
(week or release + # depending on chart type) must exceed minpercent + # + # * percent of total must also. + # + # * but wait, if the percent of any of the last four + # rows is above the threshold, keep that after all + # + # TODO: weight this towards the end of the data, so we don't drop + # emerging interesting things in favor of old news? + # + # old way: toosmall = datatable.div(datatable.sum(axis=1), axis=0).max() < config['minpercent'] / 100 + # + toosmall = datatable[datatable==datatable.max()].div(datatable.sum(axis=1),axis=0).max() < config['minpercent'] / 100 + toosmall |= datatable.sum()/datatable.sum().sum() < config['minpercent'] / 100 + toosmall &= datatable[-4:].div(datatable.sum(axis=1),axis=0).fillna(0).max() < config['minpercent'] / 100 + others = toosmall[toosmall == True].keys() + othercol = datatable[others].sum(axis=1).astype("Int64") + datatable.drop(columns=others, inplace=True) + # still too big? + if len(datatable.columns) > config['maxitems']: + # the -1 in `config['maxitems']-1` is so we don't exceed the + # limit by adding the "others" column! 
+ others = datatable.sum(axis=0).sort_values(ascending=False)[config['maxitems']-1:].index + othercol += datatable[others].sum(axis=1).astype("Int64") + datatable.drop(columns=others, inplace=True) + + # if the remaining "other" ends up big enough to matter, add it to the table + # the division is: highest row (release, say) for the item, compared to the total for that row + if othercol.any() and othercol.max() / datatable.sum(axis=1).max() >= config['minpercent'] / 100: + datatable['other'] = othercol + + # For bar charts, drop any rows (bars) which are below the threshold + if kind == 'bar': + toosmall=datatable.sum(axis=1)/datatable.sum(axis=1).max() < config['minpercent'] / 100 + datatable.drop(toosmall[toosmall == True].keys(), inplace=True) + # If the items in this dataset aren't numeric, # sort columns by weight From cbda02742ce46c969323affd59b3f3bf3be627c7 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 26 2022 21:57:33 +0000 Subject: [PATCH 47/49] wip --- diff --git a/TODO.md b/TODO.md index 0bcaea5..317d876 100644 --- a/TODO.md +++ b/TODO.md @@ -10,6 +10,8 @@ 4. have some script that pre-renders some defaults (PARTIAL) 5. 
and a simple front-end for exploring the rest +* option to center date on highest peak (and given number of months on + each side * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index fbdc262..c865b40 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -373,19 +373,31 @@ def main(): Template(params['subtitle']).safe_substitute(madlibs), fontsize=14) - plt.autoscale(enable=True, axis='x', tight=True) + #plt.autoscale(enable=True, axis='x', tight=True) plt.autoscale(enable=True, axis='y', tight=False) graph.set_ylim([0, None]) graph.spines['right'].set_visible(False) graph.spines['top'].set_visible(False) graph.yaxis.set_major_formatter(yformatter) - # graph.xaxis.set_major_formatter(dates.DateFormatter('%b \'%y')) graph.set_xlabel('') - # aesthetic pickiness! - if kind == 'bar': - ax.tick_params(bottom=False) + + + + match kind: + case 'bar': + # aesthetic pickiness! + ax.tick_params(bottom=False) + case _: + # pretty date labels + + label_format = '{:,%b %Y}' + ax.xaxis.set_major_locator(m.ticker.MaxNLocator(3)) + ticks_loc = ax.get_xticks().tolist() + ax.xaxis.set_major_locator(m.ticker.FixedLocator(ticks_loc)) + ax.set_xticklabels([label_format.format(x) for x in ticks_loc]) + ax.figure.autofmt_xdate(rotation=0, ha='center') # Not sure why these get rotated by default. Unrotate them! plt.xticks(rotation = 0) From f01f7e58c3283e8ac1758379f6cb1466fac93a80 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 27 2022 21:57:46 +0000 Subject: [PATCH 48/49] well. that was ridiculous. but it works! 
--- diff --git a/TODO.md b/TODO.md index 317d876..192982e 100644 --- a/TODO.md +++ b/TODO.md @@ -13,6 +13,8 @@ * option to center date on highest peak (and given number of months on each side +* consider making date in bronto.db be the _end_ of the week +* * epel -- need to special-case EL 8 by-release graphs to add peak for both before and after CentOS Linux 8 EOL diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index c865b40..ee14361 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -11,13 +11,11 @@ Options: """ -import os import sqlite3 from string import Template from pprint import pprint -import pandas as pd import toml import re @@ -25,8 +23,13 @@ import re import matplotlib as m import matplotlib.pyplot as plt + +import pandas as pd + from docopt import docopt +from datetime import datetime, timezone + from collections import defaultdict from brontosaurusifier_utils import colormapping @@ -325,6 +328,7 @@ def main(): colormap=colormap, kind=kind, stacked=stacked, + xlim=[datetime.strptime("2021-01-01",'%Y-%m-%d'),datetime.strptime(timestamp,'%Y-%m-%d')] ) # Labels and titles and stuff. @@ -373,7 +377,7 @@ def main(): Template(params['subtitle']).safe_substitute(madlibs), fontsize=14) - #plt.autoscale(enable=True, axis='x', tight=True) + plt.autoscale(enable=True, axis='y', tight=False) graph.set_ylim([0, None]) graph.spines['right'].set_visible(False) @@ -383,21 +387,21 @@ def main(): - - match kind: case 'bar': # aesthetic pickiness! 
ax.tick_params(bottom=False) + plt.autoscale(enable=True, axis='x', tight=True) case _: # pretty date labels - - label_format = '{:,%b %Y}' - ax.xaxis.set_major_locator(m.ticker.MaxNLocator(3)) - ticks_loc = ax.get_xticks().tolist() - ax.xaxis.set_major_locator(m.ticker.FixedLocator(ticks_loc)) - ax.set_xticklabels([label_format.format(x) for x in ticks_loc]) - ax.figure.autofmt_xdate(rotation=0, ha='center') + # + # This is horrific, but I can't get matplotlib and pandas to cooperate, + # so for some reason matplotlib sees this as week numbers. + ax.xaxis.set_major_formatter(m.ticker.FuncFormatter(lambda x, _: datetime.fromtimestamp(604800*x-1,timezone.utc).strftime("%B\n%Y"))) + # hide last label so it doesn't overlap. + ltick=ax.xaxis.get_major_ticks()[-1] + ltick.label1.set_visible(False) + ltick.tick1line.set_visible(False) # Not sure why these get rotated by default. Unrotate them! plt.xticks(rotation = 0) @@ -411,7 +415,7 @@ def main(): graph.figure.savefig(f"{imagepath}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight") - #graph.figure.savefig(f"images/test.png", dpi=config['dpi'], bbox_inches="tight") + #graph.figure.savefig(f"images/test.png", dpi=config['dpi']/2, bbox_inches="tight") plt.close(graph.figure) print(f"{Template(params['filebase']).safe_substitute(madlibs)}.{ext}") From 3e1e7612c87e55efcd170a902754b160ded891e2 Mon Sep 17 00:00:00 2001 From: Matthew Miller Date: Jul 28 2022 11:29:50 +0000 Subject: [PATCH 49/49] spelling --- diff --git a/brontosaurus_plotter.py b/brontosaurus_plotter.py index ee14361..5914abb 100755 --- a/brontosaurus_plotter.py +++ b/brontosaurus_plotter.py @@ -410,7 +410,7 @@ def main(): for ext in config['image_types']: imagepath = Template(config['imagepath']).safe_substitute(madlibs) imagepath = Template(imagepath).safe_substitute({ 'filetype': ext }) - if filter and not ( filterstring == "_age=ephemeral" or filterstring == "_age=persistant" ): + if filter and not ( 
filterstring == "_age=ephemeral" or filterstring == "_age=persistent" ): imagepath += "/filtered" graph.figure.savefig(f"{imagepath}/{Template(params['filebase']).safe_substitute(madlibs)}.{ext}", dpi=config['dpi'], bbox_inches="tight")