From a21584d803f8247806b40024c7e680199f2698d9 Mon Sep 17 00:00:00 2001
From: Michael Reuscher <michael.reuscher@desy.de>
Date: Wed, 26 Jul 2023 17:02:13 +0200
Subject: [PATCH] statistics plot 1.2

---
 visualizer/visualizer/DataFrameCreator.py | 30 ++++-----
 visualizer/visualizer/SeabornPlotter.py   | 78 +++++++----------------
 2 files changed, 39 insertions(+), 69 deletions(-)

diff --git a/visualizer/visualizer/DataFrameCreator.py b/visualizer/visualizer/DataFrameCreator.py
index cc518f1..64621bf 100644
--- a/visualizer/visualizer/DataFrameCreator.py
+++ b/visualizer/visualizer/DataFrameCreator.py
@@ -37,8 +37,20 @@ class DataFrameCreator:
             datasize = last_numbers
         else:
             datasize = 0
-        group = re.search(r"(.*):", data).group(1)
-        result["Group"] = group
+        group_match = re.search(r"(.*):", data)
+        if group_match:
+            group = group_match.group(1)
+            thread_match = re.search(r"thread \d+", group)
+            if thread_match:
+                thread = thread_match.group()
+                result["thread"] = thread
+                group = group.replace(thread, "").strip()
+            else:
+                result["thread"] = ""
+            result["Group"] = group
+        else:
+            result["Group"] = ""
+            result["thread"] = ""
         result["size"] = int(datasize)
 
         values = re.findall(r"\d+\.\d+", data)
@@ -52,16 +64,6 @@ class DataFrameCreator:
 
         return result
 
-    @staticmethod
-    def _extract_avg(data):
-        result = {}
-        match = re.search(r"(\d+\.\d+)", data)
-        if match:
-            result["avg"] = float(match.group())
-        else:
-            result["avg"] = 0.0
-        return result
-
     def read_various_data(self, root, filename):
         dataframe, dataframe_type, dataframe_name = self._set_dataframe(root, filename)
         if dataframe_type in self.v_dfs:
@@ -88,9 +90,6 @@ class DataFrameCreator:
                 if data.startswith("worker") or data.startswith("posix"):
                     data_dict = self._extract_statistics(data, filename)
                     data_list.append(data_dict)
-                elif data.startswith("average"):
-                    data_dict = self._extract_avg(data)
-                    data_list.append(data_dict)
 
             dataframe = pd.DataFrame(data_list)
             dataframe_type = filename
@@ -147,3 +146,4 @@ class DataFrameCreator:
         statistics_dfs = self._merge_dataframes(self.s_dfs)
 
         return various_dfs, long_dfs, statistics_dfs
+
diff --git a/visualizer/visualizer/SeabornPlotter.py b/visualizer/visualizer/SeabornPlotter.py
index 0509ac5..a09dea0 100644
--- a/visualizer/visualizer/SeabornPlotter.py
+++ b/visualizer/visualizer/SeabornPlotter.py
@@ -12,15 +12,6 @@ class SeabornPlotter:
         self.dataframe = dataframe
         self.name = name
 
-    @staticmethod
-    def format_speed(speed_gibs):
-        # Convert speed from GiB/s to MiB/s
-        speed_mibs = speed_gibs * 1024
-
-        # Round the speed to two decimal places
-        speed_mibs_rounded = round(speed_mibs, 2)
-        return speed_mibs_rounded
-
     @staticmethod
     def _add_logo(logo_path, logo_size, logo_x, logo_y):
         current_directory = os.path.dirname(os.path.abspath(__file__))
@@ -77,51 +68,30 @@ class SeabornPlotter:
         plt.savefig(title + ".svg", format='svg')
         plt.show()
 
-    def plot_avg_speed(self, speed_dict):
-        data_list = list(speed_dict.items())
-        df = pd.DataFrame(data_list, columns=["Data_set", "MiB/s"])
-
-        # Plot erstellen
-        plt.figure(figsize=(12, 6))
-        sns.lineplot(x=df["Data_set"], y=df["MiB/s"], marker='o')
-        plt.xticks(rotation=90)  # Rotiere die x-Achsenbeschriftungen für bessere Lesbarkeit
-        self._add_logo("desy_logo.png", logo_size=0.1, logo_x=1.065, logo_y=1.07)
-        plt.xlabel('Data_Set')
-        plt.yscale('log')
-        plt.ylabel('MiB/s')
-        plt.title('AVG Speed')
-        plt.tight_layout()
-        plt.show()
-
     def plot_statistics(self):
+        """Save and show one figure per data set: a log-scale bar plot for each 'Group'."""
         dataframes = self.dataframe
-        avg_speed_dict = {}
-        # Sort the dataframes based on the 'size' column in ascending order
-        sorted_dataframes = sorted(dataframes.items(), key=lambda x: x[1]['size'].min())
 
-        num_plots = len(sorted_dataframes)  # Number of DataFrames in the dictionary
-        rows = 6  # Number of rows in the grid
-        cols = (num_plots + 1) // rows  # Number of columns in the grid
-
-        # Subplots creation
-
-        fig, axes = plt.subplots(rows, cols, figsize=(180, 120), constrained_layout=True)
-        fig.suptitle('Statistical Measures for Each Data_Set', fontsize=14)
-        # For each DataFrame, create a boxplot and place it into the corresponding subplot
-        for idx, (df_key, df) in enumerate(sorted_dataframes):
-            sns.boxplot(data=df, order=['min', 'max', 'mean', 'med', '10%', '90%'], ax=axes[idx // cols, idx % cols], palette='pastel')
-
-            axes[idx // cols, idx % cols].set_yscale('log')
-            axes[idx // cols, idx % cols].set_ylabel('Time (Y-Axis)')
-            axes[idx // cols, idx % cols].set_title(f'Group {df_key}', fontsize=12)
-            axes[idx // cols, idx % cols].grid(True)
-            first_valid_index = df['avg'].first_valid_index()
-            speed_value = df.loc[first_valid_index, 'avg']
-            avg_speed_dict[df_key] = self.format_speed(speed_value)
-
-        # Hide empty subplots, if any
-        for idx in range(num_plots, rows * cols):
-            fig.delaxes(axes.flatten()[idx])
-
-        plt.show()
-        self.plot_avg_speed(avg_speed_dict)
+        # Shared Y limits are loop-invariant; the offset keeps the log-axis lower bound positive.
+        global_min_speed = min((d['speed'].min() for d in dataframes.values()), default=0) + 1e-5
+        global_max_speed = max((d['speed'].max() for d in dataframes.values()), default=1)
+        rows = 2  # Number of rows in the grid
+        for df_key, df in dataframes.items():
+            operations = df['Group'].unique()
+            cols = max(1, (len(operations) + 1) // rows)  # never request a zero-column grid
+            # squeeze=False keeps axes 2-D; a one-column grid used to break axes[row, col].
+            fig, axes = plt.subplots(rows, cols, figsize=(15, 8), constrained_layout=True, squeeze=False)
+            fig.suptitle(f'Measures for {df_key}', fontsize=14)
+            # One bar plot per operation, filling the grid row by row
+            for ax, operation in zip(axes.flat, operations):
+                operation_df = df[df['Group'] == operation]
+                sns.barplot(data=operation_df, errorbar=None, order=['min', 'max', 'mean', 'med', '10%', '90%'], ax=ax, palette='pastel')
+                ax.set_yscale('log')
+                ax.set_ylabel('Time (Y-Axis)')
+                ax.set_title(f'Operation: {operation}', fontsize=12)
+                ax.grid(True)
+                ax.set_ylim(bottom=global_min_speed, top=global_max_speed)  # Set the Y-axis limits
+            for unused_ax in axes.flat[len(operations):]:
+                fig.delaxes(unused_ax)  # hide grid cells that have no operation
+            plt.savefig(df_key + ".svg", format='svg')
+            plt.show()
-- 
GitLab