From f8cf55466a0497372a6e9724595bb2a8f5460a19 Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Fri, 9 Apr 2021 09:38:00 -0700
Subject: [PATCH 1/9] updated structure grouper

- Added fields for StructureMatcher tolerances
- Allowed the kwargs to be passed during construction
---
 emmet-core/emmet/core/structure_group.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/emmet-core/emmet/core/structure_group.py b/emmet-core/emmet/core/structure_group.py
index 9d1cf16225..b162249d71 100644
--- a/emmet-core/emmet/core/structure_group.py
+++ b/emmet-core/emmet/core/structure_group.py
@@ -11,7 +11,8 @@
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 
-from emmet.core.mpid import MPID
+__author__ = "Jimmy Shen"
+__email__ = "jmmshn@gmail.com"
 
 logger = logging.getLogger(__name__)
 
@@ -79,6 +80,18 @@ class StructureGroupDoc(BaseModel):
         "present the chemsys will also include the ignored species.",
     )
 
+    ltol: float = Field(
+        None, description="Lattice length tolerance parameter for the StructureMatcher."
+    )
+
+    stol: float = Field(
+        None, description="site position tolerance parameter for the StructureMatcher."
+    )
+
+    angle_tol: float = Field(
+        None, description="Bond angle tolerance parameter for the StructureMatcher."
+    )
+
     last_updated: datetime = Field(
         None,
         description="Timestamp when this document was built.",
@@ -94,6 +107,7 @@ def from_grouped_entries(
         cls,
         entries: List[Union[ComputedEntry, ComputedStructureEntry]],
         ignored_species: List[str],
+        **kwargs,
     ) -> "StructureGroupDoc":
         """ "
         Assuming a list of entries are already grouped together, create a StructureGroupDoc
@@ -125,6 +139,7 @@ def from_grouped_entries(
             "chemsys": "-".join(sorted(all_atoms | set(ignored_species))),
             "has_distinct_compositions": len(all_comps) > 1,
         }
+        fields.update(kwargs)
 
         return cls(**fields)
 
@@ -184,7 +199,11 @@ def from_ungrouped_structure_entries(
             )
             for g in group_entries_with_structure_matcher(f_group_l, sm):
                 struct_group = cls.from_grouped_entries(
-                    g, ignored_species=ignored_species
+                    g,
+                    ignored_species=ignored_species,
+                    ltol=ltol,
+                    stol=stol,
+                    angle_tol=angle_tol,
                 )
                 cnt_ += len(struct_group.material_ids)
                 results.append(struct_group)

From 711200b54fe4fdae1713b1e5f83c21ba3eacfc7d Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Sun, 11 Apr 2021 10:24:11 -0700
Subject: [PATCH 2/9] added migration builder

---
 .../emmet/builders/materials/electrodes.py    |  5 ++
 tests/emmet-core/test_migration.py            | 57 +++++++++++++++++++
 2 files changed, 62 insertions(+)
 create mode 100644 tests/emmet-core/test_migration.py

diff --git a/emmet-builders/emmet/builders/materials/electrodes.py b/emmet-builders/emmet/builders/materials/electrodes.py
index 5d782d4d3c..baccec3346 100644
--- a/emmet-builders/emmet/builders/materials/electrodes.py
+++ b/emmet-builders/emmet/builders/materials/electrodes.py
@@ -429,3 +429,8 @@ def update_targets(self, items: List):
             self.insertion_electrode.update(docs=items, key=["battery_id"])
         else:
             self.logger.info("No items to update")
+
+
+# class MigrationGraphBuilder(InsertionElectrodeBuilder):
+#     def process_item(self, item) -> Dict:
+#         pass
diff --git a/tests/emmet-core/test_migration.py b/tests/emmet-core/test_migration.py
new file mode 100644
index 0000000000..ed966e3a70
--- /dev/null
+++ b/tests/emmet-core/test_migration.py
@@ -0,0 +1,57 @@
+import pytest
+from monty.serialization import loadfn
+from pymatgen.apps.battery.conversion_battery import ConversionElectrode
+from pymatgen.apps.battery.insertion_battery import InsertionElectrode
+from pymatgen.core import Composition
+from pymatgen.entries.computed_entries import ComputedEntry
+
+from emmet.core.electrode import (
+    ConversionElectrodeDoc,
+    ConversionVoltagePairDoc,
+    InsertionElectrodeDoc,
+    InsertionVoltagePairDoc,
+)
+from emmet.core.migration import MigrationGraphDoc
+
+__author__ = "Jimmy Shen"
+__email__ = "jmmshn@gmail.com"
+
+
+@pytest.fixture(scope="session")
+def entries_lto(test_dir):
+    """
+    Recycle the test cases from pymatgen
+    """
+    entries = loadfn(test_dir / "LiTiO2_batt.json")
+    for itr, ient in enumerate(entries):
+        ient.entry_id = f"mp-{itr}"
+    return entries
+
+
+@pytest.fixture(scope="session")
+def insertion_elec(test_dir) -> InsertionElectrode:
+    """
+    Insertion electrod object
+    """
+    entry_Li = ComputedEntry("Li", -1.90753119)
+    entries_LTO = loadfn(test_dir / "LiTiO2_batt.json")
+    return InsertionElectrode.from_entries(entries_LTO, entry_Li)
+
+
+def test_StructureGroupDoc_from_ungrouped_entries(insertion_elec: InsertionElectrode):
+    entries = insertion_elec.get_stable_entries()
+    entry_li = ComputedEntry("Li", -1.90753119)
+    dist_thresh = 4
+    mg = MigrationGraphDoc.from_entries(
+        entries=entries,
+        working_ion_entry=entry_li,
+        ltol=0.4,
+        stol=0.6,
+        angle_tol=15,
+        symprec=0.1,
+        min_distance_cutoff=dist_thresh,
+    )
+
+    assert len(mg.migration_graph_object.only_sites) == 6
+    for u, v, d in mg.migration_graph_object.m_graph.graph.edges(data=True):
+        assert d["hop_distance"] < dist_thresh

From de71bb89e938b9ccd7d5918bff4bf5d3b0768304 Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Sun, 11 Apr 2021 10:25:13 -0700
Subject: [PATCH 3/9] added migration document model

requires new pymatgen-diffusion PR
---
 emmet-core/emmet/core/migration.py | 213 +++++++++++++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 emmet-core/emmet/core/migration.py

diff --git a/emmet-core/emmet/core/migration.py b/emmet-core/emmet/core/migration.py
new file mode 100644
index 0000000000..a7a13ce262
--- /dev/null
+++ b/emmet-core/emmet/core/migration.py
@@ -0,0 +1,213 @@
+import logging
+import math
+import operator
+from datetime import datetime
+from itertools import groupby
+from typing import Iterable, List, Union
+
+from docutils.nodes import Element
+from monty.json import MontyDecoder
+from pydantic import BaseModel, Field, validator
+from pymatgen.analysis.diffusion.neb.full_path_mapper import MigrationGraph
+from pymatgen.analysis.graphs import StructureGraph
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
+from pymatgen.core import Composition, Structure
+from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+
+from emmet.core.structure_group import StructureGroupDoc
+
+__author__ = "Jimmy Shen"
+__email__ = "jmmshn@gmail.com"
+
+logger = logging.getLogger(__name__)
+
+
+class MigrationGraphDoc(BaseModel):
+    """
+    Migration Graph
+    """
+
+    battery_id: str = Field(
+        None,
+        description="The id for this migration graph document, shared with "
+        "insertion electrode since the same kind of structure "
+        "grouping is performed.",
+    )
+
+    # host_structure: Structure = Field(
+    #     None,
+    #     description="Host structure (structure without the working ion)",
+    # )
+    #
+    # migration_graph: StructureGraph = Field(
+    #     None,
+    #     description="The StructureGraph object that contains all of the migration sites"
+    # )
+    #
+    # framework: Composition = Field(
+    #     None,
+    #     description="The chemical compositions of the host framework",
+    # )
+    #
+    # elements: List[Element] = Field(
+    #     None,
+    #     description="The atomic species contained in the host structure (not including the working ion).",
+    # )
+    #
+    # nelements: int = Field(
+    #     None,
+    #     description="The number of elements in the material (not including the working ion).",
+    # )
+    #
+    # chemsys: str = Field(
+    #     None,
+    #     description="The chemical system the host lattice belongs to (not including the working ion)",
+    # )
+
+    found_path: bool = Field(
+        None, description="True, if an intercalating path is found."
+    )
+
+    ltol: float = Field(
+        None, description="Lattice length tolerance parameter for the StructureMatcher."
+    )
+
+    stol: float = Field(
+        None, description="site position tolerance parameter for the StructureMatcher."
+    )
+
+    angle_tol: float = Field(
+        None, description="Bond angle tolerance parameter for the StructureMatcher."
+    )
+
+    symprec: float = Field(None, description="SPGLIB tolerance parameter.")
+
+    migration_graph_object: MigrationGraph = Field(
+        None, description="The migration pathway object forom " "pymatgen-diffussion."
+    )
+
+    barrier: float = Field(
+        None,
+        description="The highest energy difference along the path with the "
+        "lowest cumulative absolute energy difference.",
+    )
+
+    last_updated: datetime = Field(
+        None,
+        description="Timestamp when this document was built.",
+    )
+
+    # Make sure that the datetime field is properly formatted
+    @validator("last_updated", pre=True)
+    def last_updated_dict_ok(cls, v):
+        return MontyDecoder().process_decoded(v)
+
+    @classmethod
+    def from_entries(
+        cls,
+        entries: List[ComputedStructureEntry],
+        working_ion_entry: ComputedEntry,
+        ltol: float,
+        stol: float,
+        angle_tol: float,
+        symprec: float,
+        min_distance_cutoff: float = 5.0,
+        max_distance_cutoff: float = 10.0,
+        **kwargs,
+    ) -> Union["MigrationGraphDoc", None]:
+        """
+        Parse a list of entries and construct the migration graph.
+        The tolerances must be explicitly provided.
+        Args:
+            entries: A list of entries that is already grouped together.
+            working_ion_entry: Computed entry containing the metallic phase of the working ion.
+            ltol: length tolerance parameter
+            stol: site tolerance parameter
+            angle_tol: angular tolerance parameter
+            symprec: SPGLIB tolerance parameter
+            min_distance_cutoff: The initial guess for the bonding distance, if no intercalation pathways are found,
+                                    the threshold will be increased by 1 Angstrom until max_distance_cutoff
+            max_distance_cutoff: The maximum we allowed to increase the distance cutoff to look for
+                                    intercalation pathways
+            kwargs: Additional kwargs to help search and filter, should be taken directly from electrode document.
+
+        Returns:
+            A MigrationGraphDocument
+        """
+        migrating_species = working_ion_entry.composition.reduced_formula
+        cur_id = kwargs.get("battery_id", "MISSING battery_id")
+
+        slist = MigrationGraph.get_structure_from_entries(
+            entries=entries,
+            migrating_ion_entry=working_ion_entry,
+            ltol=ltol,
+            stol=stol,
+            angle_tol=angle_tol,
+            symprec=symprec,
+        )
+
+        if len(slist) == 0:
+            logger.warning(
+                f"No structure with meta-stable sites could be generate for id: [{cur_id}]"
+            )
+            return None
+
+        struct = slist[0]
+        d_cut = min_distance_cutoff
+        mg = None
+        while d_cut <= max_distance_cutoff:
+            mg = MigrationGraph.with_distance(
+                structure=struct,
+                migrating_specie=migrating_species,
+                max_distance=d_cut,
+                symprec=0.01,
+            )
+            mg.assign_cost_to_graph()
+            u, path_hops = next(mg.get_path())
+            if len(path_hops) != 0:
+                break
+            d_cut += 1.0
+
+        if mg is None:
+            logger.warning(f"No Migration graph could be generate for id: [{cur_id}]")
+            return None
+
+        # adding the energy difference
+        for lab, d in mg.unique_hops.items():
+            e_u = mg.only_sites.sites[d["iindex"]].properties["insertion_energy"]
+            e_v = mg.only_sites.sites[d["eindex"]].properties["insertion_energy"]
+            ediff = abs(e_u - e_v)
+            mg.add_data_to_similar_edges(
+                target_label=d["hop_label"], data={"ediff": ediff}
+            )
+
+        mg.assign_cost_to_graph(cost_keys=["ediff"])
+        lowest_cost, best_path = math.inf, []
+
+        for u, path in mg.get_path():
+            cum_cost = sum([hop["cost"] for hop in path])
+            if cum_cost < lowest_cost:
+                lowest_cost, best_path = cum_cost, path
+
+        all_sites_along_path = set()
+
+        for hop in best_path:
+            all_sites_along_path |= {hop["iindex"], hop["eindex"]}
+
+        site_energies = [
+            mg.only_sites.sites[ii_].properties["insertion_energy"]
+            for ii_ in all_sites_along_path
+        ]
+        barrier = max(site_energies) - min(site_energies)
+
+        fields = {
+            "ltol": ltol,
+            "stol": stol,
+            "angle_tol": angle_tol,
+            "migration_graph_object": mg,
+            "barrier": barrier,
+        }
+
+        fields.update(kwargs)
+        return cls(**fields)

From ec78f33280c59c58422bc30366fd261bb39e7a47 Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Wed, 14 Apr 2021 12:30:52 -0700
Subject: [PATCH 4/9] refactor of electrode builders

- moved structure group builder to its own file
- created base builder class GroupedThermoDocsBuilder that just grabs group docs and thermo docs
- InsertionElectrodeBuilder is now a subclass of GroupedThermoDocsBuilder
---
 .../emmet/builders/materials/electrodes.py    | 347 +++---------------
 .../builders/materials/structure_group.py     | 280 ++++++++++++++
 2 files changed, 336 insertions(+), 291 deletions(-)
 create mode 100644 emmet-builders/emmet/builders/materials/structure_group.py

diff --git a/emmet-builders/emmet/builders/materials/electrodes.py b/emmet-builders/emmet/builders/materials/electrodes.py
index baccec3346..95c01ac148 100644
--- a/emmet-builders/emmet/builders/materials/electrodes.py
+++ b/emmet-builders/emmet/builders/materials/electrodes.py
@@ -24,294 +24,45 @@
 __email__ = "jmmshn@lbl.gov"
 
 
-def s_hash(el):
-    return el.data["comp_delith"]
-
-
-# MatDoc = namedtuple("MatDoc", ["material_id", "structure", "formula_pretty", "framework"])
-
-REDOX_ELEMENTS = [
-    "Ti",
-    "V",
-    "Cr",
-    "Mn",
-    "Fe",
-    "Co",
-    "Ni",
-    "Cu",
-    "Nb",
-    "Mo",
-    "Sn",
-    "Sb",
-    "W",
-    "Re",
-    "Bi",
-    "C",
-    "Hf",
-]
-
-WORKING_IONS = ["Li", "Be", "Na", "Mg", "K", "Ca", "Rb", "Sr", "Cs", "Ba"]
-
-MAT_PROPS = ["structure", "material_id", "formula_pretty", "entries"]
-
-sg_fields = ["number", "hall_number", "international", "hall", "choice"]
-
-
-def generic_groupby(list_in, comp=operator.eq):
+class GroupedThermoDocsBuilder(Builder):
     """
-    Group a list of unsortable objects
-    Args:
-        list_in: A list of generic objects
-        comp: (Default value = operator.eq) The comparator
-    Returns:
-        [int] list of labels for the input list
+    Used grouped ID to fetch entries from the thermo collection
+    This can be subclassed to accomplish more things with the entries
     """
-    list_out = [None] * len(list_in)
-    label_num = 0
-    for i1, ls1 in enumerate(list_out):
-        if ls1 is not None:
-            continue
-        list_out[i1] = label_num
-        for i2, ls2 in list(enumerate(list_out))[i1 + 1 :]:
-            if comp(list_in[i1], list_in[i2]):
-                if list_out[i2] is None:
-                    list_out[i2] = list_out[i1]
-                else:
-                    list_out[i1] = list_out[i2]
-                    label_num -= 1
-        label_num += 1
-    return list_out
-
 
-class StructureGroupBuilder(Builder):
     def __init__(
         self,
-        materials: MongoStore,
-        sgroups: MongoStore,
-        working_ion: str,
+        grouped_materials: MongoStore,
+        thermo: MongoStore,
+        target: MongoStore,
         query: dict = None,
-        ltol: float = 0.2,
-        stol: float = 0.3,
-        angle_tol: float = 5.0,
-        check_newer: bool = True,
         **kwargs,
     ):
         """
-        Aggregate materials entries into sgroups that are topotactically similar to each other.
-        This is an incremental builder that makes ensures that each materials id belongs to one StructureGroupDoc document
+        Group ThermoDocuments together
         Args:
-            materials (Store): Store of materials documents that contains the structures
-            sgroups (Store): Store of grouped material ids
-            query (dict): dictionary to limit materials to be analyzed ---
-                            only applied to the materials when we need to group structures
-                            the phase diagram is still constructed with the entire set
-        """
-        self.materials = materials
-        self.sgroups = sgroups
-        self.working_ion = working_ion
-        self.query = query if query else {}
-        self.ltol = ltol
-        self.stol = stol
-        self.angle_tol = angle_tol
-        self.check_newer = check_newer
-        super().__init__(sources=[materials], targets=[sgroups], **kwargs)
-
-    def prechunk(self, number_splits: int) -> Iterable[Dict]:
-        """
-        TODO can implement this for distributed runs by adding filters
-        """
-        pass
-
-    def get_items(self):
-        """
-        Summary of the steps:
-        - query the materials database for different chemical systems that satisfies the base query
-          "contains redox element and working ion"
-        - Get the full chemsys list of interest
-        - The main loop is over all these chemsys.  within the main loop:
-            - get newest timestamp for the material documents (max_mat_time)
-            - get the oldest timestamp for the target documents (min_target_time)
-            - if min_target_time is < max_mat_time then nuke all the target documents
+            grouped_materials:
+            thermo: The thermo collection, documents are retrieved with the "material_ids" field
+            target: The target collection the key from the grouped_materials collection is mapped directly here
+            query: The query to be performed on the grouped_materials collection
+            **kwargs:
         """
-        other_wions = list(set(WORKING_IONS) - {self.working_ion})
-        # All potentially interesting chemsys must contain the working ion
-        base_query = {
-            "$and": [
-                self.query.copy(),
-                {"elements": {"$in": REDOX_ELEMENTS}},
-                {"elements": {"$in": [self.working_ion]}},
-                {"elements": {"$nin": other_wions}},
-            ]
-        }
-        self.logger.debug(f"Initial Chemsys QUERY: {base_query}")
-
-        # get a chemsys that only contains the working ion since the working ion
-        # must be present for there to be voltage steps
-        all_chemsys = self.materials.distinct("chemsys", criteria=base_query)
-        # Contains the working ion but not ONLY the working ion
-        all_chemsys = [
-            *filter(
-                lambda x: self.working_ion in x and len(x) > 1,
-                [chemsys_.split("-") for chemsys_ in all_chemsys],
-            )
-        ]
-
-        self.logger.debug(
-            f"Performing initial checks on {len(all_chemsys)} chemical systems containing redox elements with or without the Working Ion."
-        )
-        self.total = len(all_chemsys)
-
-        for chemsys_l in all_chemsys:
-            chemsys = "-".join(sorted(chemsys_l))
-            chemsys_wo = "-".join(sorted(set(chemsys_l) - {self.working_ion}))
-            chemsys_query = {
-                "$and": [
-                    {"chemsys": {"$in": [chemsys_wo, chemsys]}},
-                    self.query.copy(),
-                ]
-            }
-            self.logger.debug(f"QUERY: {chemsys_query}")
-            all_mats_in_chemsys = list(
-                self.materials.query(
-                    criteria=chemsys_query,
-                    properties=MAT_PROPS + [self.materials.last_updated_field],
-                )
-            )
-            self.logger.debug(
-                f"Found {len(all_mats_in_chemsys)} materials in {chemsys_wo}"
-            )
-            if self.check_newer:
-                all_target_docs = list(
-                    self.sgroups.query(
-                        criteria={"chemsys": chemsys},
-                        properties=[
-                            "group_id",
-                            self.sgroups.last_updated_field,
-                            "material_ids",
-                        ],
-                    )
-                )
-                self.logger.debug(
-                    f"Found {len(all_target_docs)} Grouped documents in {chemsys_wo}"
-                )
-
-                mat_times = [
-                    mat_doc[self.materials.last_updated_field]
-                    for mat_doc in all_mats_in_chemsys
-                ]
-                max_mat_time = max(mat_times, default=datetime.min)
-                self.logger.debug(
-                    f"The newest material doc was generated at {max_mat_time}."
-                )
-
-                target_times = [
-                    g_doc[self.materials.last_updated_field]
-                    for g_doc in all_target_docs
-                ]
-                min_target_time = min(target_times, default=datetime.max)
-                self.logger.debug(
-                    f"The newest GROUP doc was generated at {min_target_time}."
-                )
-
-                mat_ids = set(
-                    [mat_doc["material_id"] for mat_doc in all_mats_in_chemsys]
-                )
-
-                # If any material id is missing or if any material id has been updated
-                target_ids = set()
-                for g_doc in all_target_docs:
-                    target_ids |= set(g_doc["material_ids"])
-
-                self.logger.debug(
-                    f"There are {len(mat_ids)} material ids in the source database vs {len(target_ids)} in the target database."
-                )
-                if mat_ids == target_ids and max_mat_time < min_target_time:
-                    self.logger.info(f"Skipping chemsys {chemsys}.")
-                    yield None
-                elif len(target_ids) == 0:
-                    self.logger.info(
-                        f"No documents in chemsys {chemsys} in the target database."
-                    )
-                    yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
-                else:
-                    self.logger.info(
-                        f"Nuking all {len(target_ids)} documents in chemsys {chemsys} in the target database."
-                    )
-                    self._remove_targets(list(target_ids))
-                    yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
-            else:
-                yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
-
-    def update_targets(self, items: List):
-        items = list(filter(None, chain.from_iterable(items)))
-        if len(items) > 0:
-            self.logger.info("Updating {} sgroups documents".format(len(items)))
-            for struct_group_dict in items:
-                struct_group_dict[self.sgroups.last_updated_field] = datetime.utcnow()
-            self.sgroups.update(docs=items, key=["group_id"])
-        else:
-            self.logger.info("No items to update")
-
-    def _entry_from_mat_doc(self, mdoc):
-        # Note since we are just structure grouping we don't need to be careful with energy or correction
-        # All of the energy analysis is left to other builders
-        d_ = {
-            "entry_id": mdoc["material_id"],
-            "structure": mdoc["structure"],
-            "energy": -math.inf,
-            "correction": -math.inf,
-        }
-        return ComputedStructureEntry.from_dict(d_)
-
-    def process_item(self, item: Any) -> Any:
-        if item is None:
-            return None
-        entries = [*map(self._entry_from_mat_doc, item["materials"])]
-        s_groups = StructureGroupDoc.from_ungrouped_structure_entries(
-            entries=entries,
-            ignored_species=[self.working_ion],
-            ltol=self.ltol,
-            stol=self.stol,
-            angle_tol=self.angle_tol,
-        )
-        return [sg.dict() for sg in s_groups]
-
-    def _remove_targets(self, rm_ids):
-        self.sgroups.remove_docs({"material_ids": {"$in": rm_ids}})
-
-
-class InsertionElectrodeBuilder(Builder):
-    def __init__(
-        self,
-        grouped_materials: MongoStore,
-        thermo: MongoStore,
-        insertion_electrode: MongoStore,
-        query: dict = None,
-        **kwargs,
-    ):
         self.grouped_materials = grouped_materials
-        self.insertion_electrode = insertion_electrode
         self.thermo = thermo
+        self.target = target
         self.query = query if query else {}
 
         super().__init__(
             sources=[self.grouped_materials, self.thermo],
-            targets=[self.insertion_electrode],
+            targets=[self.target],
             **kwargs,
         )
 
     def get_items(self):
         """
-        Get items
+        Retrieve the thermo documents
         """
 
-        @lru_cache(1000)
-        def get_working_ion_entry(working_ion):
-            with self.thermo as store:
-                working_ion_docs = [*store.query({"chemsys": working_ion})]
-            best_wion = min(working_ion_docs, key=lambda x: x["energy_per_atom"])
-            return best_wion
-
         def get_thermo_docs(mat_ids):
             self.logger.debug(
                 f"Looking for {len(mat_ids)} material_id in the Thermo DB."
@@ -344,42 +95,56 @@ def get_thermo_docs(mat_ids):
                     "validated for the materials builder."
                 )
                 return None
-
-            # if len(item["ignored_species"]) != 1:
-            #     raise ValueError(
-            #         "Insertion electrode can only be defined for one working ion species"
-            #     )
-
             return thermo_docs
-            # return {
-            #     "group_id": item["group_id"],
-            #     "working_ion_doc": working_ion_doc,
-            #     "working_ion": item["ignored_species"][0],
-            #     "thermo_docs": thermo_docs,
-            # }
 
         q_ = {"$and": [self.query, {"has_distinct_compositions": True}]}
         self.total = self.grouped_materials.count(q_)
         for group_doc in self.grouped_materials.query(q_):
-            working_ion_doc = get_working_ion_entry(group_doc["ignored_species"][0])
-            thermo_docs = get_thermo_docs(group_doc["material_ids"])
-            if thermo_docs:
-                yield {
-                    "group_id": group_doc["group_id"],
-                    "working_ion_doc": working_ion_doc,
-                    "working_ion": group_doc["ignored_species"][0],
-                    "thermo_docs": thermo_docs,
-                }
-            else:
-                yield None
+            group_doc["thermo_docs"] = get_thermo_docs(group_doc["material_ids"])
+            yield group_doc
+
+    def process_item(self, item) -> Dict:
+        return item
+
+    def update_targets(self, items: List):
+        items = list(filter(None, items))
+        if len(items) > 0:
+            self.logger.info("Updating {} documents".format(len(items)))
+            for struct_group_dict in items:
+                struct_group_dict[
+                    self.grouped_materials.last_updated_field
+                ] = datetime.utcnow()
+            self.target.update(docs=items, key=self.grouped_materials.key)
+        else:
+            self.logger.info("No items to update")
+
+
+class InsertionElectrodeBuilder(GroupedThermoDocsBuilder):
+    def get_items(self):
+        """
+        Get items
+        """
+
+        @lru_cache(1000)
+        def get_working_ion_entry(working_ion):
+            with self.thermo as store:
+                working_ion_docs = [*store.query({"chemsys": working_ion})]
+            best_wion = min(working_ion_docs, key=lambda x: x["energy_per_atom"])
+            return best_wion
+
+        for item in super().get_items():
+            item["working_ion_doc"] = get_working_ion_entry(item["ignored_species"][0])
+            item["working_ion"] = item["ignored_species"][0]
+            yield item
 
     def process_item(self, item) -> Dict:
         """
         - Add volume information to each entry to create the insertion electrode document
         - Add the host structure
         """
-        if item is None:
+        if item["thermo_docs"] is None:
             return None
+
         self.logger.debug(
             f"Working on {item['group_id']} with {len(item['thermo_docs'])}"
         )
@@ -431,6 +196,6 @@ def update_targets(self, items: List):
             self.logger.info("No items to update")
 
 
-# class MigrationGraphBuilder(InsertionElectrodeBuilder):
-#     def process_item(self, item) -> Dict:
-#         pass
+class MigrationGraphBuilder(InsertionElectrodeBuilder):
+    def process_item(self, item) -> Dict:
+        pass
diff --git a/emmet-builders/emmet/builders/materials/structure_group.py b/emmet-builders/emmet/builders/materials/structure_group.py
new file mode 100644
index 0000000000..6d29cab259
--- /dev/null
+++ b/emmet-builders/emmet/builders/materials/structure_group.py
@@ -0,0 +1,280 @@
+import math
+import operator
+from collections import namedtuple
+from datetime import datetime
+from functools import lru_cache
+from itertools import chain, groupby
+from pprint import pprint
+from typing import Any, Dict, Iterable, List
+
+from maggma.builders import Builder, MapBuilder
+from maggma.stores import MongoStore
+from monty.json import MontyEncoder
+from numpy import unique
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
+from pymatgen.apps.battery.insertion_battery import InsertionElectrode
+from pymatgen.core import Composition, Structure
+from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
+
+from emmet.core.electrode import InsertionElectrodeDoc
+from emmet.core.structure_group import StructureGroupDoc
+from emmet.core.utils import jsanitize
+
+__author__ = "Jimmy Shen"
+__email__ = "jmmshn@lbl.gov"
+
+
+def s_hash(el):
+    return el.data["comp_delith"]  # NOTE(review): presumably the delithiated composition stored on the entry - confirm
+
+
+# MatDoc = namedtuple("MatDoc", ["material_id", "structure", "formula_pretty", "framework"])
+
+REDOX_ELEMENTS = [
+    "Ti",
+    "V",
+    "Cr",
+    "Mn",
+    "Fe",
+    "Co",
+    "Ni",
+    "Cu",
+    "Nb",
+    "Mo",
+    "Sn",
+    "Sb",
+    "W",
+    "Re",
+    "Bi",
+    "C",
+    "Hf",
+]
+
+WORKING_IONS = ["Li", "Be", "Na", "Mg", "K", "Ca", "Rb", "Sr", "Cs", "Ba"]
+
+MAT_PROPS = ["structure", "material_id", "formula_pretty", "entries"]
+
+sg_fields = ["number", "hall_number", "international", "hall", "choice"]
+
+
+def generic_groupby(list_in, comp=operator.eq):
+    """
+    Group a list of unsortable objects by pairwise comparison
+    Args:
+        list_in: A list of generic objects
+        comp: (Default value = operator.eq) The comparator, called as comp(a, b); truthy means "same group"
+    Returns:
+        [int] list of group labels, one per element of list_in, in input order
+    """
+    list_out = [None] * len(list_in)
+    label_num = 0
+    for i1, ls1 in enumerate(list_out):
+        if ls1 is not None:
+            continue
+        list_out[i1] = label_num
+        for i2 in range(i1 + 1, len(list_in)):
+            if comp(list_in[i1], list_in[i2]):
+                if list_out[i2] is None:
+                    list_out[i2] = list_out[i1]
+                else:  # i2 was labelled by an earlier element: i1 adopts that label
+                    list_out[i1] = list_out[i2]
+                    label_num -= 1
+        label_num += 1
+    return list_out
+
+
+class StructureGroupBuilder(Builder):
+    def __init__(
+        self,
+        materials: MongoStore,
+        sgroups: MongoStore,
+        working_ion: str,
+        query: dict = None,
+        ltol: float = 0.2,
+        stol: float = 0.3,
+        angle_tol: float = 5.0,
+        check_newer: bool = True,
+        **kwargs,
+    ):
+        """
+        Aggregate materials entries into sgroups that are topotactically similar to each other.
+        This is an incremental builder that ensures each material id belongs to one StructureGroupDoc document
+        Args:
+            materials (Store): Store of materials documents that contains the structures
+            sgroups (Store): Store of grouped material ids
+            working_ion (str): symbol of the working ion, the species that is ignored when grouping
+            query (dict): dictionary to limit the materials to be analyzed
+            ltol, stol, angle_tol (float): grouping tolerances; check_newer (bool): skip up-to-date chemsys
+        """
+        self.materials = materials
+        self.sgroups = sgroups
+        self.working_ion = working_ion
+        self.query = query if query else {}
+        self.ltol = ltol
+        self.stol = stol
+        self.angle_tol = angle_tol
+        self.check_newer = check_newer
+        super().__init__(sources=[materials], targets=[sgroups], **kwargs)
+
+    def prechunk(self, number_splits: int) -> Iterable[Dict]:
+        """
+        Not implemented yet (returns None). TODO: implement for distributed runs by adding filters
+        """
+        pass
+
+    def get_items(self):
+        """
+        Summary of the steps:
+        - query the materials database for different chemical systems that satisfies the base query
+          "contains redox element and working ion"
+        - Get the full chemsys list of interest
+        - The main loop is over all these chemsys.  within the main loop:
+            - get newest timestamp for the material documents (max_mat_time)
+            - get the oldest timestamp for the target documents (min_target_time)
+            - if min_target_time is < max_mat_time then nuke all the target documents
+        """
+        other_wions = list(set(WORKING_IONS) - {self.working_ion})
+        # All potentially interesting chemsys must contain the working ion
+        base_query = {
+            "$and": [
+                self.query.copy(),
+                {"elements": {"$in": REDOX_ELEMENTS}},
+                {"elements": {"$in": [self.working_ion]}},
+                {"elements": {"$nin": other_wions}},
+            ]
+        }
+        self.logger.debug(f"Initial Chemsys QUERY: {base_query}")
+
+        # candidate chemical systems must contain the working ion, since the working ion
+        # must be present for there to be voltage steps
+        all_chemsys = self.materials.distinct("chemsys", criteria=base_query)
+        # Contains the working ion but not ONLY the working ion
+        all_chemsys = [
+            *filter(
+                lambda x: self.working_ion in x and len(x) > 1,
+                [chemsys_.split("-") for chemsys_ in all_chemsys],
+            )
+        ]
+
+        self.logger.debug(
+            f"Performing initial checks on {len(all_chemsys)} chemical systems containing redox elements with or without the Working Ion."
+        )
+        self.total = len(all_chemsys)
+
+        for chemsys_l in all_chemsys:
+            chemsys = "-".join(sorted(chemsys_l))
+            chemsys_wo = "-".join(sorted(set(chemsys_l) - {self.working_ion}))
+            chemsys_query = {
+                "$and": [
+                    {"chemsys": {"$in": [chemsys_wo, chemsys]}},
+                    self.query.copy(),
+                ]
+            }
+            self.logger.debug(f"QUERY: {chemsys_query}")
+            all_mats_in_chemsys = list(
+                self.materials.query(
+                    criteria=chemsys_query,
+                    properties=MAT_PROPS + [self.materials.last_updated_field],
+                )
+            )
+            self.logger.debug(
+                f"Found {len(all_mats_in_chemsys)} materials in {chemsys_wo}"
+            )
+            if self.check_newer:
+                all_target_docs = list(
+                    self.sgroups.query(
+                        criteria={"chemsys": chemsys},
+                        properties=[
+                            "group_id",
+                            self.sgroups.last_updated_field,
+                            "material_ids",
+                        ],
+                    )
+                )
+                self.logger.debug(
+                    f"Found {len(all_target_docs)} Grouped documents in {chemsys_wo}"
+                )
+
+                mat_times = [
+                    mat_doc[self.materials.last_updated_field]
+                    for mat_doc in all_mats_in_chemsys
+                ]
+                max_mat_time = max(mat_times, default=datetime.min)
+                self.logger.debug(
+                    f"The newest material doc was generated at {max_mat_time}."
+                )
+
+                target_times = [
+                    g_doc[self.sgroups.last_updated_field]
+                    for g_doc in all_target_docs
+                ]
+                min_target_time = min(target_times, default=datetime.max)
+                self.logger.debug(
+                    f"The oldest GROUP doc was generated at {min_target_time}."
+                )
+
+                mat_ids = {mat_doc["material_id"] for mat_doc in all_mats_in_chemsys}
+
+                # Gather the material ids already grouped in the target; a mismatch or a stale timestamp triggers a rebuild
+                target_ids = set()
+                for g_doc in all_target_docs:
+                    target_ids |= set(g_doc["material_ids"])
+
+                self.logger.debug(
+                    f"There are {len(mat_ids)} material ids in the source database vs {len(target_ids)} in the target database."
+                )
+                if mat_ids == target_ids and max_mat_time < min_target_time:
+                    self.logger.info(f"Skipping chemsys {chemsys}.")
+                    yield None
+                elif len(target_ids) == 0:
+                    self.logger.info(
+                        f"No documents in chemsys {chemsys} in the target database."
+                    )
+                    yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
+                else:
+                    self.logger.info(
+                        f"Nuking all {len(target_ids)} documents in chemsys {chemsys} in the target database."
+                    )
+                    self._remove_targets(list(target_ids))
+                    yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
+            else:
+                yield {"chemsys": chemsys, "materials": all_mats_in_chemsys}
+
+    def update_targets(self, items: List):
+        """Timestamp all processed group docs and upsert them by group_id; None/empty results are ignored."""
+        docs = [doc for group in items if group for doc in group if doc]
+        if docs:
+            self.logger.info("Updating {} sgroups documents".format(len(docs)))
+            for struct_group_dict in docs:
+                struct_group_dict[self.sgroups.last_updated_field] = datetime.utcnow()
+            self.sgroups.update(docs=docs, key=["group_id"])
+        else:
+            self.logger.info("No items to update")
+
+    def _entry_from_mat_doc(self, mdoc):
+        # Note since we are just structure grouping we don't need to be careful with energy or correction
+        # All of the energy analysis is left to other builders
+        d_ = {
+            "entry_id": mdoc["material_id"],
+            "structure": mdoc["structure"],
+            "energy": -math.inf,
+            "correction": -math.inf,
+        }
+        return ComputedStructureEntry.from_dict(d_)
+
+    def process_item(self, item: Any) -> Any:
+        """Group the materials of one chemsys into StructureGroupDoc dicts; a None item passes through."""
+        if item is None:
+            return None
+        structure_entries = [self._entry_from_mat_doc(mdoc) for mdoc in item["materials"]]
+        matcher_tols = {"ltol": self.ltol, "stol": self.stol, "angle_tol": self.angle_tol}
+        grouped_docs = StructureGroupDoc.from_ungrouped_structure_entries(
+            entries=structure_entries,
+            ignored_species=[self.working_ion],
+            **matcher_tols,
+        )
+        return [group_doc.dict() for group_doc in grouped_docs]
+
+    def _remove_targets(self, rm_ids):
+        """Delete every target group doc whose material_ids contain any id in rm_ids."""
+        self.sgroups.remove_docs({"material_ids": {"$in": rm_ids}})

From 9cfac18b2b8ec4c08f598defd00693acc3ef74cc Mon Sep 17 00:00:00 2001
From: jmmshn <jmmshn@gmail.com>
Date: Wed, 14 Apr 2021 12:46:25 -0700
Subject: [PATCH 5/9] lint

---
 emmet-builders/emmet/builders/materials/electrodes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/emmet-builders/emmet/builders/materials/electrodes.py b/emmet-builders/emmet/builders/materials/electrodes.py
index 95c01ac148..8e7ec22e62 100644
--- a/emmet-builders/emmet/builders/materials/electrodes.py
+++ b/emmet-builders/emmet/builders/materials/electrodes.py
@@ -5,7 +5,7 @@
 from functools import lru_cache
 from itertools import chain, groupby
 from pprint import pprint
-from typing import Any, Dict, Iterable, List
+from typing import Any, Dict, Iterable, List, Union
 
 from maggma.builders import Builder, MapBuilder
 from maggma.stores import MongoStore
@@ -137,7 +137,7 @@ def get_working_ion_entry(working_ion):
             item["working_ion"] = item["ignored_species"][0]
             yield item
 
-    def process_item(self, item) -> Dict:
+    def process_item(self, item) -> Union[Dict, None]:
         """
         - Add volume information to each entry to create the insertion electrode document
         - Add the host structure

From 7da376459bd2cef3cb1fbf33dcd35115fc1d1ed7 Mon Sep 17 00:00:00 2001
From: Jimmy Shen <jmmshn@gmail.com>
Date: Tue, 4 May 2021 15:44:17 -0700
Subject: [PATCH 6/9] added pymatgen-diffusion

---
 emmet-core/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/emmet-core/requirements.txt b/emmet-core/requirements.txt
index 055f4ec076..b9f6328bc7 100644
--- a/emmet-core/requirements.txt
+++ b/emmet-core/requirements.txt
@@ -1,4 +1,5 @@
 pymatgen==2022.0.5
+pymatgen-analysis-diffusion==2021.4.29
 https://github.com/materialsvirtuallab/monty/archive/8d67c335bd5d8bb71ecc8ac732c82a53e0def4a1.zip
 pydantic==1.8.1
 pybtex==0.24.0

From 0527c19c7852559ea08de276769f9ee00063558e Mon Sep 17 00:00:00 2001
From: Jimmy Shen <jmmshn@gmail.com>
Date: Tue, 4 May 2021 15:44:45 -0700
Subject: [PATCH 7/9] typo

---
 emmet-core/emmet/core/migration.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/emmet-core/emmet/core/migration.py b/emmet-core/emmet/core/migration.py
index a7a13ce262..9ecf91577e 100644
--- a/emmet-core/emmet/core/migration.py
+++ b/emmet-core/emmet/core/migration.py
@@ -5,7 +5,6 @@
 from itertools import groupby
 from typing import Iterable, List, Union
 
-from docutils.nodes import Element
 from monty.json import MontyDecoder
 from pydantic import BaseModel, Field, validator
 from pymatgen.analysis.diffusion.neb.full_path_mapper import MigrationGraph

From 7dfba174fd422563497ba8abfa1577ccf09f1112 Mon Sep 17 00:00:00 2001
From: Jimmy Shen <jmmshn@gmail.com>
Date: Tue, 4 May 2021 15:48:34 -0700
Subject: [PATCH 8/9] Moved working ion and redox atoms to settings

---
 .../builders/materials/structure_group.py     | 30 +++---------------
 emmet-builders/emmet/builders/settings.py     | 31 +++++++++++++++++++
 2 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/emmet-builders/emmet/builders/materials/structure_group.py b/emmet-builders/emmet/builders/materials/structure_group.py
index 6d29cab259..6734a0d719 100644
--- a/emmet-builders/emmet/builders/materials/structure_group.py
+++ b/emmet-builders/emmet/builders/materials/structure_group.py
@@ -5,7 +5,7 @@
 from functools import lru_cache
 from itertools import chain, groupby
 from pprint import pprint
-from typing import Any, Dict, Iterable, List
+from typing import Any, Dict, Iterable, List, Optional
 
 from maggma.builders import Builder, MapBuilder
 from maggma.stores import MongoStore
@@ -16,6 +16,7 @@
 from pymatgen.core import Composition, Structure
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 
+from emmet.builders.settings import EmmetBuildSettings
 from emmet.core.electrode import InsertionElectrodeDoc
 from emmet.core.structure_group import StructureGroupDoc
 from emmet.core.utils import jsanitize
@@ -30,28 +31,6 @@ def s_hash(el):
 
 # MatDoc = namedtuple("MatDoc", ["material_id", "structure", "formula_pretty", "framework"])
 
-REDOX_ELEMENTS = [
-    "Ti",
-    "V",
-    "Cr",
-    "Mn",
-    "Fe",
-    "Co",
-    "Ni",
-    "Cu",
-    "Nb",
-    "Mo",
-    "Sn",
-    "Sb",
-    "W",
-    "Re",
-    "Bi",
-    "C",
-    "Hf",
-]
-
-WORKING_IONS = ["Li", "Be", "Na", "Mg", "K", "Ca", "Rb", "Sr", "Cs", "Ba"]
-
 MAT_PROPS = ["structure", "material_id", "formula_pretty", "entries"]
 
 sg_fields = ["number", "hall_number", "international", "hall", "choice"]
@@ -94,6 +73,7 @@ def __init__(
         stol: float = 0.3,
         angle_tol: float = 5.0,
         check_newer: bool = True,
+        settings: Optional[EmmetBuildSettings] = None,
         **kwargs,
     ):
         """
@@ -114,6 +94,7 @@ def __init__(
         self.stol = stol
         self.angle_tol = angle_tol
         self.check_newer = check_newer
+        self.settings = EmmetBuildSettings.autoload(settings)
         super().__init__(sources=[materials], targets=[sgroups], **kwargs)
 
     def prechunk(self, number_splits: int) -> Iterable[Dict]:
@@ -133,12 +114,12 @@ def get_items(self):
             - get the oldest timestamp for the target documents (min_target_time)
             - if min_target_time is < max_mat_time then nuke all the target documents
         """
-        other_wions = list(set(WORKING_IONS) - {self.working_ion})
+        other_wions = list(set(self.settings.SGROUP_WORKING_IONS) - {self.working_ion})
         # All potentially interesting chemsys must contain the working ion
         base_query = {
             "$and": [
                 self.query.copy(),
-                {"elements": {"$in": REDOX_ELEMENTS}},
+                {"elements": {"$in": self.settings.SGROUP_REDOX_ELEMENTS}},
                 {"elements": {"$in": [self.working_ion]}},
                 {"elements": {"$nin": other_wions}},
             ]
diff --git a/emmet-builders/emmet/builders/settings.py b/emmet-builders/emmet/builders/settings.py
index 72aa1cc6d1..70d92fcade 100644
--- a/emmet-builders/emmet/builders/settings.py
+++ b/emmet-builders/emmet/builders/settings.py
@@ -32,6 +32,36 @@ class EmmetBuildSettings(EmmetSettings):
         description="Allowed task_types to build materials from",
     )
 
+    SGROUP_WORKING_IONS: List[str] = Field(
+        ["Li", "Be", "Na", "Mg", "K", "Ca", "Rb", "Sr", "Cs", "Ba"],
+        description="Working ions names, all groups will consist of formulas "
+                    "with and without the working ion."
+    )
+
+    SGROUP_REDOX_ELEMENTS: List[str] = Field(
+        [
+            "Ti",
+            "V",
+            "Cr",
+            "Mn",
+            "Fe",
+            "Co",
+            "Ni",
+            "Cu",
+            "Nb",
+            "Mo",
+            "Sn",
+            "Sb",
+            "W",
+            "Re",
+            "Bi",
+            "C",
+            "Hf",
+        ],
+        description="Redox element names, all groups will consist of formulas "
+                    "that must contain redox element."
+    )
+
     DEFAULT_REFERENCE: str = Field(
         "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and "
         "Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and "
@@ -59,3 +89,4 @@ class EmmetBuildSettings(EmmetSettings):
         ),
         description="Default History for provenance ",
     )
+

From 65cc9f381057b4432d938fd95d8d583ce65b4eff Mon Sep 17 00:00:00 2001
From: Jimmy Shen <jmmshn@gmail.com>
Date: Tue, 4 May 2021 16:00:00 -0700
Subject: [PATCH 9/9] added process_item for migration graph builder

---
 .../emmet/builders/materials/electrodes.py    | 31 +++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/emmet-builders/emmet/builders/materials/electrodes.py b/emmet-builders/emmet/builders/materials/electrodes.py
index 8e7ec22e62..9fcf809cd7 100644
--- a/emmet-builders/emmet/builders/materials/electrodes.py
+++ b/emmet-builders/emmet/builders/materials/electrodes.py
@@ -18,6 +18,7 @@
 
 from emmet.core.electrode import InsertionElectrodeDoc
 from emmet.core.structure_group import StructureGroupDoc
+from emmet.core.migration import MigrationGraphDoc
 from emmet.core.utils import jsanitize
 
 __author__ = "Jimmy Shen"
@@ -122,7 +123,9 @@ def update_targets(self, items: List):
 class InsertionElectrodeBuilder(GroupedThermoDocsBuilder):
     def get_items(self):
         """
-        Get items
+        Additional fields:
+            - working_ion: the name of the working ion
+            - working_ion_doc: the materials document for the working ion
         """
 
         @lru_cache(1000)
@@ -198,4 +201,28 @@ def update_targets(self, items: List):
 
 class MigrationGraphBuilder(InsertionElectrodeBuilder):
     def process_item(self, item) -> Dict:
-        pass
+        """Build the sanitized MigrationGraphDoc dict for one structure group (None if no thermo docs)."""
+        if item["thermo_docs"] is None:
+            return None
+
+        self.logger.debug(
+            f"Working on {item['group_id']} with {len(item['thermo_docs'])}"
+        )
+
+        entries = [
+            tdoc_["entries"][tdoc_["energy_type"]] for tdoc_ in item["thermo_docs"]
+        ]
+        entries = list(map(ComputedStructureEntry.from_dict, entries))
+        working_ion_entry = ComputedEntry.from_dict(
+            item["working_ion_doc"]["entries"][item["working_ion_doc"]["energy_type"]]
+        )
+
+        mg_doc = MigrationGraphDoc.from_entries(
+            entries=entries,
+            working_ion_entry=working_ion_entry,
+            ltol=item["ltol"],
+            stol=item["stol"],
+            angle_tol=item["angle_tol"],
+            symprec=item["symprec"],
+        )
+        return jsanitize(mg_doc.dict())