Creating Diagrams

Diagrams package

This site uses the `diagrams` Python package to draw all diagrams as code.

from IPython.display import Markdown

# Mermaid source for the "original dense block -> upcycled MoE block" flowchart.
# The delimiters must be plain ASCII triple quotes: typographic quotes are not
# valid Python string delimiters and make the cell fail with a SyntaxError.
# NOTE(review): the text carries no ```mermaid fence, so Markdown() will
# presumably show it as plain text rather than a rendered diagram -- confirm
# whether a fence was lost when this page was exported.
diagram_code = """

flowchart LR
  %% ============== ORIGINAL DENSE BLOCK ==============
  subgraph OD["Original dense block"]
    direction LR

    OD_LN1["Layer norm"]
    OD_ATT["Attention"]
    OD_ADD1(["⊕"])
    OD_LN2["Layer norm"]
    OD_MLP["MLP"]
    OD_ADD2(["⊕"])

    %% forward path
    OD_LN1 --> OD_ATT --> OD_ADD1 --> OD_LN2 --> OD_MLP --> OD_ADD2

    %% residuals
    OD_LN1 -. residual .-> OD_ADD1
    OD_LN2 -. residual .-> OD_ADD2
  end

  %% ============== UPCYCLED MoE BLOCK ==============
  subgraph UM["Upcycled MoE block"]
    direction LR

    U_LN1["Layer norm"]
    U_ATT["Attention"]
    U_ADD1(["⊕"])
    U_LN2["Layer norm"]

    %% ---- MoE subgraph ----
    subgraph UMOE["MoE"]
      direction TB

      ROUTER["Router from scratch"]

      subgraph EXP["Experts"]
        direction LR
        EXP_MLP1["MLP 1"]
        EXP_MLP2["MLP 2"]
        EXP_MLPE["MLP E"]
      end

      WS["Weighted Sum"]

      %% router to experts
      ROUTER --> EXP_MLP1
      ROUTER --> EXP_MLP2
      ROUTER --> EXP_MLPE

      %% experts to weighted sum
      EXP_MLP1 --> WS
      EXP_MLP2 --> WS
      EXP_MLPE --> WS
    end

    U_ADD2(["⊕"])

    %% forward path through Upcycled MoE block
    U_LN1 --> U_ATT --> U_ADD1 --> U_LN2 --> ROUTER
    WS --> U_ADD2

    %% residuals
    U_LN1 -. residual .-> U_ADD1
    U_LN2 -. residual .-> U_ADD2
  end

  %% ============== WEIGHT COPYING CONNECTIONS ==============
  %% Copy weights: layernorm/attention/layernorm
  OD_LN1 -. "Copy weights" .-> U_LN1
  OD_ATT -. "Copy weights" .-> U_ATT
  OD_LN2 -. "Copy weights" .-> U_LN2

  %% Make E MLP copies
  MAKE_COPIES["Make E MLP copies"]
  OD_MLP -.-> MAKE_COPIES
  MAKE_COPIES -. "Make copies" .-> EXP_MLP1
  MAKE_COPIES -. "Make copies" .-> EXP_MLP2
  MAKE_COPIES -. "Make copies" .-> EXP_MLPE

  %% ============== CLASSES / STYLING ==============
  classDef block fill:#ffffff,stroke:#222,stroke-width:1px,rx:5px,ry:5px;
  classDef moe fill:#e6f2d8,stroke:#88aa66,stroke-width:1px,rx:8px,ry:8px;
  classDef exp fill:#f4ecff,stroke:#9a7acc,stroke-width:1px,rx:6px,ry:6px;
  classDef add fill:#ffffff,stroke:#222,stroke-width:1px;

  class OD_LN1,OD_ATT,OD_LN2,OD_MLP,U_LN1,U_ATT,U_LN2,WS,ROUTER,MAKE_COPIES,EXP_MLP1,EXP_MLP2,EXP_MLPE block;
  class UMOE moe;
  class EXP exp;
  class OD_ADD1,OD_ADD2,U_ADD1,U_ADD2 add;

"""

# Wrap the text in IPython's Markdown display object; as the last expression
# of the cell it becomes the cell's displayed result.
Markdown(diagram_code)

# pip install diagrams
# Graphviz (dot) must be installed and on PATH.

from diagrams import Diagram, Cluster, Edge, Node


# Residual-add marker: a small, fixed-size white circle showing "⊕".
def adder(name=""):
    """Create the circular ⊕ node drawn where a skip edge rejoins the main path.

    When *name* is non-empty it is placed on a second line beneath the symbol;
    otherwise the node shows the bare symbol.
    """
    if name:
        label = f"⊕\n{name}"
    else:
        label = "⊕"

    # Graphviz attributes for the circle, passed through to Node unchanged.
    circle_attrs = {
        "shape": "circle",
        "width": "0.35",
        "height": "0.35",
        "fixedsize": "true",
        "fontsize": "14",
        "style": "filled",
        "fillcolor": "white",
    }
    return Node(label, **circle_attrs)


# Graphviz defaults shared by every node and every edge of this diagram.
node_defaults = {
    "shape": "box",
    "style": "rounded,filled",
    "fillcolor": "white",
    "fontsize": "10",
}
edge_defaults = {"arrowsize": "0.7", "fontsize": "9"}

# Draws the dense block next to its upcycled MoE counterpart and writes the
# result to "upcycled_moe_block_full" in PNG format without opening a viewer.
with Diagram(
    "Upcycled MoE block",
    direction="LR",  # main flow runs left to right
    outformat="png",
    filename="upcycled_moe_block_full",
    show=False,
    node_attr=node_defaults,
    edge_attr=edge_defaults,
):
    # ======== Cluster 1: the original dense block ========
    with Cluster("Original dense block"):
        o_ln1 = Node("Layer\nnorm")
        o_attn = Node("Attention")
        o_add1 = adder()  # joins the skip edge after Attention
        o_ln2 = Node("Layer\nnorm")
        o_add2 = adder()  # joins the skip edge after the MLP
        o_mlp = Node("MLP")

        # Forward chain, one hop per statement.
        o_ln1 >> o_attn
        o_attn >> o_add1
        o_add1 >> o_ln2
        o_ln2 >> o_mlp
        o_mlp >> o_add2

        # Skip connections are drawn without an arrowhead.
        for skip_from, skip_to in ((o_ln1, o_add1), (o_ln2, o_add2)):
            skip_from >> Edge(arrowhead="none") >> skip_to

    # ======== Cluster 2: the upcycled MoE block ========
    with Cluster("Upcycled MoE block"):
        u_ln1 = Node("Layer\nnorm")
        u_attn = Node("Attention")
        u_add1 = adder()
        u_ln2 = Node("Layer\nnorm")

        # The MoE sub-block sits where the dense block has its single MLP.
        with Cluster("MoE"):
            router = Node("Router\nfrom scratch")

            with Cluster("Experts"):
                e_mlp1 = Node("MLP 1")
                e_mlp2 = Node("MLP 2")
                e_mlpE = Node("MLP E")

            weighted_sum = Node("Weighted\nSum")

            experts = (e_mlp1, e_mlp2, e_mlpE)

            # Fan out from the router to every expert ...
            for expert in experts:
                router >> expert

            # ... then fan back in to the weighted sum.
            for expert in experts:
                expert >> weighted_sum

        u_add2 = adder()  # joins the skip edge after the MoE sub-block

        # Forward chain through the upcycled block, one hop per statement.
        u_ln1 >> u_attn
        u_attn >> u_add1
        u_add1 >> u_ln2
        u_ln2 >> router
        weighted_sum >> u_add2

        # Skip connections, again without an arrowhead.
        for skip_from, skip_to in ((u_ln1, u_add1), (u_ln2, u_add2)):
            skip_from >> Edge(arrowhead="none") >> skip_to

    # ======== Dashed edges between the two clusters ========
    # Layer norm / attention / layer norm map one-to-one and are labelled.
    for dense_node, moe_node in ((o_ln1, u_ln1), (o_attn, u_attn), (o_ln2, u_ln2)):
        dense_node >> Edge(style="dashed", label="Copy weights") >> moe_node

    # The dense MLP goes through an intermediate node that fans out,
    # unlabelled, to each expert.
    make_copies = Node("Make E\nMLP copies")
    o_mlp >> Edge(style="dashed") >> make_copies
    for expert in (e_mlp1, e_mlp2, e_mlpE):
        make_copies >> Edge(style="dashed") >> expert

Software Architecture

from diagrams import Diagram
from diagrams.c4 import Person, Container, Database, System, SystemBoundary, Relationship

# Graph-level Graphviz attributes handed to the diagram below.
graph_attr = {"splines": "spline"}

# C4 container view of the Internet Banking System, laid out top to bottom.
with Diagram(
    "Container diagram for Internet Banking System",
    show=False,
    graph_attr=graph_attr,
    direction="TB",
) as diag:
    # --- Actor ---
    customer = Person(
        name="Personal Banking Customer",
        description="A customer of the bank, with personal bank accounts.",
    )

    # --- Containers inside the system boundary ---
    with SystemBoundary("Internet Banking System"):
        webapp = Container(
            name="Web Application",
            technology="Java and Spring MVC",
            description="Delivers the static content and the Internet banking single page application.",
        )

        spa = Container(
            name="Single-Page Application",
            technology="Javascript and Angular",
            description="Provides all of the Internet banking functionality to customers via their web browser.",
        )

        mobileapp = Container(
            name="Mobile App",
            technology="Xamarin",
            description="Provides a limited subset of the Internet banking functionality to customers via their mobile device.",
        )

        api = Container(
            name="API Application",
            technology="Java and Spring MVC",
            description="Provides Internet banking functionality via a JSON/HTTPS API.",
        )

        database = Database(
            name="Database",
            technology="Oracle Database Schema",
            description="Stores user registration information, hashed authentication credentials, access logs, etc.",
        )

    # --- External systems, declared outside the boundary ---
    email = System(
        name="E-mail System",
        description="The internal Microsoft Exchange e-mail system.",
        external=True,
    )

    mainframe = System(
        name="Mainframe Banking System",
        description="Stores all of the core banking information about customers, accounts, transactions, etc.",
        external=True,
    )

    # Every forward relationship as (source, label, target), in drawing order.
    forward_links = (
        (customer, "Visits bigbank.com/ib using [HTTPS]", webapp),
        (customer, "Views account balances, and makes payments using", [spa, mobileapp]),
        (webapp, "Delivers to the customer's web browser", spa),
        (spa, "Make API calls to [JSON/HTTPS]", api),
        (mobileapp, "Make API calls to [JSON/HTTPS]", api),
        (api, "reads from and writes to", database),
        (api, "Sends email using [SMTP]", email),
        (api, "Makes API calls to [XML/HTTPS]", mainframe),
    )
    for source, text, target in forward_links:
        source >> Relationship(text) >> target

    # The only relationship written with the reverse (<<) operator.
    customer << Relationship("Sends e-mails to") << email

# Bare expression: presumably left last so a notebook cell displays the diagram.
diag

AWS Diagrams

from diagrams import Cluster, Diagram
from diagrams.aws.compute import ECS
from diagrams.aws.database import ElastiCache, RDS
from diagrams.aws.network import ELB
from diagrams.aws.network import Route53

# DNS -> load balancer -> three web services, which all talk to the primary
# database and to a cache.
with Diagram("Clustered Web Services", show=False) as diag:
    dns = Route53("dns")
    lb = ELB("lb")

    with Cluster("Services"):
        svc_group = [ECS(label) for label in ("web1", "web2", "web3")]

    with Cluster("DB Cluster"):
        db_primary = RDS("userdb")
        read_replica = RDS("userdb ro")
        # "-" links the primary to each node in the list.
        db_primary - [read_replica]

    memcached = ElastiCache("memcached")

    # Entry path, one hop per statement.
    dns >> lb
    lb >> svc_group

    # Every service connects to the primary database first, then to the cache.
    for backend in (db_primary, memcached):
        svc_group >> backend

# Bare expression: presumably left last so a notebook cell displays the diagram.
diag

Diagrams with custom icons

from diagrams import Diagram, Cluster
from diagrams.custom import Custom
from urllib.request import urlretrieve

# Builds a diagram whose nodes use icons downloaded at run time. Each icon is
# saved into the current working directory immediately before the node that
# uses it is created.
with Diagram(
    "Custom with remote icons",
    filename="custom_remote",
    direction="LR",
    show=False,
) as diag:
    # Root node: the diagrams project logo.
    diagrams_url, diagrams_icon = (
        "https://github.com/mingrammer/diagrams/raw/master/assets/img/diagrams.png",
        "diagrams.png",
    )
    urlretrieve(diagrams_url, diagrams_icon)
    diagrams = Custom("Diagrams", diagrams_icon)

    with Cluster("Some Providers"):
        # OpenStack
        openstack_url, openstack_icon = (
            "https://github.com/mingrammer/diagrams/raw/master/resources/openstack/openstack.png",
            "openstack.png",
        )
        urlretrieve(openstack_url, openstack_icon)
        openstack = Custom("OpenStack", openstack_icon)

        # Elastic
        elastic_url, elastic_icon = (
            "https://github.com/mingrammer/diagrams/raw/master/resources/elastic/saas/elastic.png",
            "elastic.png",
        )
        urlretrieve(elastic_url, elastic_icon)
        elastic = Custom("Elastic", elastic_icon)

    # The root node points at each provider, in declaration order.
    for provider in (openstack, elastic):
        diagrams >> provider

# Bare expression: presumably left last so a notebook cell displays the diagram.
diag

Model Architectures

We use Netron (netron.app) to visualize model architectures.