Skip to content

Clustering Info Container

idendrogram.ClusteringData

This class is used as a container to store underlying clustering data which may be used by callback functions in generating the dendrogram. Ensures expensive operations are calculated only once.

Example
#your clustering workflow
Z = scipy.cluster.hierarchy.linkage(...)
threshold = 42
cluster_assignments = scipy.cluster.hierarchy.fcluster(Z, t=threshold, ...)

#dendrogram creation
dd = idendrogram.idendrogram()
cdata = idendrogram.ClusteringData(
    linkage_matrix=Z, 
    cluster_assignments=cluster_assignments                
)
dd.set_cluster_info(cdata)
Source code in idendrogram/clustering_data.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
class ClusteringData:
    """Container for the clustering data used by dendrogram callback functions.

    Holds the linkage matrix and the flat cluster assignments, and caches the
    derived structures (cluster leaders, tree representation) so that these
    expensive operations are calculated only once.

    Example:

        ```
        #your clustering workflow
        Z = scipy.cluster.hierarchy.linkage(...)
        threshold = 42
        cluster_assignments = scipy.cluster.hierarchy.fcluster(Z, t=threshold, ...)

        #dendrogram creation
        dd = idendrogram.idendrogram()
        cdata = idendrogram.ClusteringData(
            linkage_matrix=Z,
            cluster_assignments=cluster_assignments
        )
        dd.set_cluster_info(cdata)
        ```
    """

    # True once `leaders` / `flat_cluster_ids` have been supplied or computed.
    have_leaders: bool = False
    leaders: np.ndarray
    flat_cluster_ids: np.ndarray
    # True once `rootnode` / `nodelist` have been supplied or computed.
    have_tree: bool = False
    rootnode: sch.ClusterNode
    nodelist: List[sch.ClusterNode]
    linkage_matrix: np.ndarray
    cluster_assignments: np.ndarray

    def __init__(
        self,
        linkage_matrix: np.ndarray,
        cluster_assignments: np.ndarray,
        leaders: Optional[Tuple[np.ndarray, np.ndarray]] = None,
        rootnode: Optional[sch.ClusterNode] = None,
        nodelist: Optional[List[sch.ClusterNode]] = None,
    ) -> None:
        """Set underlying clustering data that may be used by callback functions in generating the dendrogram. Ensures expensive operations are calculated only once.

        Args:
            linkage_matrix (np.ndarray): Linkage matrix as produced by
                `scipy.cluster.hierarchy.linkage` or equivalent
            cluster_assignments (np.ndarray): A one dimensional array of length N that contains flat cluster assignments for each observation. Produced by `scipy.cluster.hierarchy.fcluster` or equivalent.
            leaders (Tuple[np.ndarray, np.ndarray], optional): Root nodes of the clustering produced by `scipy.cluster.hierarchy.leaders()`.
            rootnode (sch.ClusterNode, optional): rootnode produced by `scipy.cluster.hierarchy.to_tree(..., rd=True)`. Only stored when `nodelist` is given as well.
            nodelist (List[sch.ClusterNode], optional): nodelist produced by `scipy.cluster.hierarchy.to_tree(..., rd=True)`. Only stored when `rootnode` is given as well.

        Example:

            ```
            #your clustering workflow
            Z = scipy.cluster.hierarchy.linkage(...)
            threshold = 42
            cluster_assignments = scipy.cluster.hierarchy.fcluster(Z, t=threshold, ...)

            #dendrogram creation
            dd = idendrogram.idendrogram()
            cdata = idendrogram.ClusteringData(
                linkage_matrix=Z,
                cluster_assignments=cluster_assignments,
            )
            dd.set_cluster_info(cdata)
            ```
        """
        self.linkage_matrix = linkage_matrix
        self.cluster_assignments = cluster_assignments
        if leaders is not None:
            self.have_leaders = True
            self.leaders = leaders[0]
            self.flat_cluster_ids = leaders[1]
        # The tree is only usable when both parts were supplied; otherwise it
        # is computed lazily by get_tree().
        if rootnode is not None and nodelist:
            self.have_tree = True
            self.rootnode = rootnode
            self.nodelist = nodelist

    def get_leaders(self) -> Tuple[np.ndarray, np.ndarray]:
        """A wrapper for [scipy.cluster.hierarchy.leaders](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.leaders.html). Returns the root nodes in a hierarchical clustering.

        The result is computed on first use and cached on the instance.

        Returns:
            (Tuple[np.ndarray, np.ndarray]):  [L, M] (see SciPy's documentation for details)
        """
        if not self.have_leaders:
            L, M = sch.leaders(
                self.linkage_matrix, self.cluster_assignments
            )
            self.leaders = L
            self.flat_cluster_ids = M
            self.have_leaders = True
        return self.leaders, self.flat_cluster_ids

    def get_linkage_matrix(self) -> np.ndarray:
        """Returns stored linkage matrix.

        Returns:
            linkage_matrix (np.ndarray): Linkage matrix as produced by scipy.cluster.hierarchy.linkage or equivalent.
        """
        return self.linkage_matrix

    def get_cluster_assignments(self) -> np.ndarray:
        """Returns flat cluster assignment array.

        Returns:
            cluster_assignments (np.ndarray): A one dimensional array of length N that contains flat cluster assignments for each observation. Produced by `scipy.cluster.hierarchy.fcluster` or equivalent.
        """
        return self.cluster_assignments

    def get_cluster_id(self, linkage_id: int) -> Optional[int]:
        """Returns flat cluster ID for a given linkage ID

        Args:
            linkage_id (int): Node linkage ID

        Returns:
            Optional[int]: Cluster ID if a node is within one cluster; None otherwise.
        """
        L, M = self.get_leaders()

        # Fast path: a node's descendants always have smaller IDs than the node
        # itself, so an ID beyond the largest leader ID can be neither a leader
        # nor part of a leader's subtree.
        if linkage_id > L.max():
            return None

        # Leader nodes map directly to their flat cluster ID.
        is_leader = L == linkage_id
        if is_leader.any():
            return M[is_leader][0]

        _, nodelist = self.get_tree()
        # Otherwise inspect the original observations (leaves) below the node.
        leaf_ids = nodelist[linkage_id].pre_order(lambda node: node.id)
        leaf_clusters = np.asarray(self.cluster_assignments)[leaf_ids]
        cluster = leaf_clusters[0]
        # A node with an ID below the largest leader ID may still sit above
        # several leaders (e.g. with non-distance fcluster criteria); it then
        # spans more than one cluster and has no single cluster ID.
        if (leaf_clusters != cluster).any():
            return None
        return cluster

    def get_tree(self) -> Tuple[sch.ClusterNode, List[sch.ClusterNode]]:
        """A wrapper for [scipy.cluster.hierarchy.to_tree](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.to_tree.html). Converts a linkage matrix into an easy-to-use tree object.

        The result is computed on first use and cached on the instance.

        Returns:
            Tuple[scipy.cluster.hierarchy.ClusterNode, List[scipy.cluster.hierarchy.ClusterNode]]: [rootnode, nodelist] (see SciPy's documentation for details)
        """
        if not self.have_tree:
            rootnode, nodelist = sch.to_tree(self.linkage_matrix, rd=True)
            self.rootnode = rootnode
            self.nodelist = nodelist
            self.have_tree = True
        return self.rootnode, self.nodelist

    def get_merge_map(self) -> dict:
        """Returns a dictionary that maps pairs of linkage matrix IDs to the linkage matrix ID they get merged into.

        Returns:
            dict: Dictionary tuple(ID, ID) -> merged_ID
        """

        # Keys are the pairs of IDs merged in each row of the linkage matrix,
        # e.g. for 4 observations: [(0, 1), (2, 3), (4, 5)]
        component_ids = zip(
            self.linkage_matrix[:, 0].astype(int),
            self.linkage_matrix[:, 1].astype(int),
        )
        # IDs of the clusters created by those merges: with N observations the
        # i-th row creates ID N + i, e.g. [4, 5, 6] for the example above.
        merged_ids = np.arange(
            self.linkage_matrix.shape[0] + 1,
            (self.linkage_matrix.shape[0] + 1) * 2 - 1,
        )

        # Look-up table: merged pair -> ID resulting from the merge
        merge_map = dict(zip(component_ids, merged_ids))
        return merge_map

__init__(linkage_matrix, cluster_assignments, leaders=None, rootnode=None, nodelist=None)

Set underlying clustering data that may be used by callback functions in generating the dendrogram. Ensures expensive operations are calculated only once.

Parameters:

Name Type Description Default
linkage_matrix np.ndarray

Linkage matrix as produced by scipy.cluster.hierarchy.linkage or equivalent

required
cluster_assignments np.ndarray

A one dimensional array of length N that contains flat cluster assignments for each observation. Produced by scipy.cluster.hierarchy.fcluster or equivalent.

required
leaders Tuple[np.ndarray, np.ndarray]

Root nodes of the clustering produced by scipy.cluster.hierarchy.leaders().

None
rootnode sch.ClusterNode

rootnode produced by scipy.cluster.hierarchy.to_tree(..., rd=True).

None
nodelist List[sch.ClusterNode]

nodelist produced by scipy.cluster.hierarchy.to_tree(..., rd=True)

None
Example
#your clustering workflow
Z = scipy.cluster.hierarchy.linkage(...)
threshold = 42
cluster_assignments = scipy.cluster.hierarchy.fcluster(Z, t=threshold, ...)

#dendrogram creation
dd = idendrogram.idendrogram()
cdata = idendrogram.ClusteringData(
    linkage_matrix=Z, 
    cluster_assignments=cluster_assignments,                 
)
dd.set_cluster_info(cdata)
Source code in idendrogram/clustering_data.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def __init__(
    self,
    linkage_matrix: np.ndarray,
    cluster_assignments: np.ndarray,
    leaders: Optional[Tuple[np.ndarray, np.ndarray]] = None,
    rootnode: Optional[sch.ClusterNode] = None,
    nodelist: Optional[List[sch.ClusterNode]] = None,
) -> None:
    """Set underlying clustering data that may be used by callback functions in generating the dendrogram. Ensures expensive operations are calculated only once.

    Args:
        linkage_matrix (np.ndarray): Linkage matrix as produced by
            `scipy.cluster.hierarchy.linkage` or equivalent
        cluster_assignments (np.ndarray): A one dimensional array of length N that contains flat cluster assignments for each observation. Produced by `scipy.cluster.hierarchy.fcluster` or equivalent.
        leaders (Tuple[np.ndarray, np.ndarray], optional): Root nodes of the clustering produced by `scipy.cluster.hierarchy.leaders()`.
        rootnode (sch.ClusterNode, optional): rootnode produced by `scipy.cluster.hierarchy.to_tree(..., rd=True)`. Only stored when `nodelist` is given as well.
        nodelist (List[sch.ClusterNode], optional): nodelist produced by `scipy.cluster.hierarchy.to_tree(..., rd=True)`. Only stored when `rootnode` is given as well.

    Example:

        ```
        #your clustering workflow
        Z = scipy.cluster.hierarchy.linkage(...)
        threshold = 42
        cluster_assignments = scipy.cluster.hierarchy.fcluster(Z, t=threshold, ...)

        #dendrogram creation
        dd = idendrogram.idendrogram()
        cdata = idendrogram.ClusteringData(
            linkage_matrix=Z,
            cluster_assignments=cluster_assignments,
        )
        dd.set_cluster_info(cdata)
        ```
    """
    self.linkage_matrix = linkage_matrix
    self.cluster_assignments = cluster_assignments
    if leaders is not None:
        # leaders is the (L, M) pair; store both halves separately.
        self.have_leaders = True
        self.leaders = leaders[0]
        self.flat_cluster_ids = leaders[1]
    # The tree is only recorded when both of its parts were supplied.
    if rootnode is not None and nodelist:
        self.have_tree = True
        self.rootnode = rootnode
        self.nodelist = nodelist

get_cluster_assignments()

Returns flat cluster assignment array.

Returns:

Name Type Description
cluster_assignments np.ndarray

A one dimensional array of length N that contains flat cluster assignments for each observation. Produced by scipy.cluster.hierarchy.fcluster or equivalent.

Source code in idendrogram/clustering_data.py
114
115
116
117
118
119
120
def get_cluster_assignments(self) -> np.ndarray:
    """Return the flat cluster assignments held by this container.

    Returns:
        cluster_assignments (np.ndarray): A one dimensional array of length N that contains flat cluster assignments for each observation. Produced by `scipy.cluster.hierarchy.fcluster` or equivalent.
    """
    # Plain accessor: the stored object is handed back as-is (no copy).
    assignments = self.cluster_assignments
    return assignments

get_cluster_id(linkage_id)

Returns flat cluster ID for a given linkage ID

Parameters:

Name Type Description Default
linkage_id int

Node linkage ID

required

Returns:

Type Description
Optional[int]

Optional[int]: Cluster ID if a node is within one cluster; None otherwise.

Source code in idendrogram/clustering_data.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def get_cluster_id(self, linkage_id: int) -> Optional[int]:
    """Returns flat cluster ID for a given linkage ID

    Args:
        linkage_id (int): Node linkage ID

    Returns:
        Optional[int]: Cluster ID if a node is within one cluster; None otherwise.
    """
    L, M = self.get_leaders()

    # Fast path: a node's descendants always have smaller IDs than the node
    # itself, so an ID beyond the largest leader ID can be neither a leader
    # nor part of a leader's subtree.
    if linkage_id > L.max():
        return None

    # Leader nodes map directly to their flat cluster ID.
    is_leader = L == linkage_id
    if is_leader.any():
        return M[is_leader][0]

    _, nodelist = self.get_tree()
    # Otherwise inspect the original observations (leaves) below the node.
    leaf_ids = nodelist[linkage_id].pre_order(lambda node: node.id)
    leaf_clusters = np.asarray(self.cluster_assignments)[leaf_ids]
    cluster = leaf_clusters[0]
    # A node with an ID below the largest leader ID may still sit above
    # several leaders (e.g. with non-distance fcluster criteria); it then
    # spans more than one cluster and has no single cluster ID.
    if (leaf_clusters != cluster).any():
        return None
    return cluster

get_leaders()

A wrapper for scipy.cluster.hierarchy.leaders. Returns the root nodes in a hierarchical clustering.

Returns:

Type Description
Tuple[np.ndarray, np.ndarray]

[L, M] (see SciPy's documentation for details)

Source code in idendrogram/clustering_data.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def get_leaders(self) -> Tuple[np.ndarray, np.ndarray]:
    """Return the cluster leaders, computing them on first use.

    Thin caching wrapper around [scipy.cluster.hierarchy.leaders](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.leaders.html), which returns the root nodes in a hierarchical clustering.

    Returns:
        (Tuple[np.ndarray, np.ndarray]):  [L, M] (see SciPy's documentation for details)
    """
    # Already supplied or computed earlier: reuse the stored arrays.
    if self.have_leaders:
        return self.leaders, self.flat_cluster_ids

    leader_nodes, cluster_ids = sch.leaders(self.linkage_matrix, self.cluster_assignments)
    self.leaders, self.flat_cluster_ids = leader_nodes, cluster_ids
    self.have_leaders = True
    return self.leaders, self.flat_cluster_ids

get_linkage_matrix()

Returns stored linkage matrix.

Returns:

Name Type Description
linkage_matrix np.ndarray

Linkage matrix as produced by scipy.cluster.hierarchy.linkage or equivalent.

Source code in idendrogram/clustering_data.py
107
108
109
110
111
112
def get_linkage_matrix(self) -> np.ndarray:
    """Return the linkage matrix held by this container.

    Returns:
        linkage_matrix (np.ndarray): Linkage matrix as produced by scipy.cluster.hierarchy.linkage or equivalent.
    """
    # Plain accessor: the stored object is handed back as-is (no copy).
    matrix = self.linkage_matrix
    return matrix

get_merge_map()

Returns a dictionary that maps pairs of linkage matrix IDs to the linkage matrix ID they get merged into.

Returns:

Name Type Description
dict dict

Dictionary tuple(ID, ID) -> merged_ID

Source code in idendrogram/clustering_data.py
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def get_merge_map(self) -> dict:
    """Build a lookup from each merged pair of linkage IDs to the ID created by that merge.

    Returns:
        dict: Dictionary tuple(ID, ID) -> merged_ID
    """
    linkage = self.linkage_matrix
    n_observations = linkage.shape[0] + 1

    # The first two columns hold the IDs merged in each row,
    # e.g. for 4 observations: (0, 1), (2, 3), (4, 5)
    first_ids = linkage[:, 0].astype(int)
    second_ids = linkage[:, 1].astype(int)

    # The i-th row creates ID n_observations + i,
    # e.g. [4, 5, 6] for the example above
    merged_ids = np.arange(n_observations, n_observations * 2 - 1)

    return {
        (first, second): merged
        for first, second, merged in zip(first_ids, second_ids, merged_ids)
    }

get_tree()

A wrapper for scipy.cluster.hierarchy.to_tree. Converts a linkage matrix into an easy-to-use tree object.

Returns:

Type Description
Tuple[sch.ClusterNode, List[sch.ClusterNode]]

Tuple[scipy.cluster.hierarchy.ClusterNode, List[scipy.cluster.hierarchy.ClusterNode]]: [rootnode, nodelist] (see SciPy's documentation for details)

Source code in idendrogram/clustering_data.py
151
152
153
154
155
156
157
158
159
160
161
162
def get_tree(self) -> Tuple[sch.ClusterNode, List[sch.ClusterNode]]:
    """Return the tree form of the linkage matrix, building it on first use.

    Thin caching wrapper around [scipy.cluster.hierarchy.to_tree](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.to_tree.html), which converts a linkage matrix into an easy-to-use tree object.

    Returns:
        Tuple[scipy.cluster.hierarchy.ClusterNode, List[scipy.cluster.hierarchy.ClusterNode]]: [rootnode, nodelist] (see SciPy's documentation for details)
    """
    # Already supplied or built earlier: reuse the stored tree.
    if self.have_tree:
        return self.rootnode, self.nodelist

    # rd=True makes to_tree return the node list alongside the root.
    tree = sch.to_tree(self.linkage_matrix, rd=True)
    self.rootnode, self.nodelist = tree
    self.have_tree = True
    return self.rootnode, self.nodelist