diff --git a/.gitignore b/.gitignore old mode 100755 new mode 100644 index c702daf..1cc3778 --- a/.gitignore +++ b/.gitignore @@ -138,13 +138,16 @@ dmypy.json .pyre/ .idea/ -weights/ *.pth *.onnx .vscode/ output/ datasets/ -weights/ vendor/vendor/ vendor/ +log2.md +vendor/ +.DS_Store +wandb/ +artifacts/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ffcbe76 --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ +GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. 
+ + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + YOLOv7 Copyright (C) 2022 Lucas Jin + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/philosophy/why-not-lgpl.html>.
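Note on the config changes that follow: the YAML files added and renamed below use detectron2's layered config convention, where each leaf file names a _BASE_ (Base-YOLOv7.yaml, Base-SparseInst.yaml, and so on) and overrides individual nodes on top of it. The sketch below shows, under stated assumptions, how such a file is typically loaded with stock detectron2/yacs APIs only; the repo's own entry points presumably register their extra keys (MODEL.YOLO, MODEL.DARKNET, MODEL.SPARSE_INST) through a dedicated add-config helper that this diff does not show, so set_new_allowed(True) is used here purely as a stand-in for that step.

# Illustrative only: load one of the configs in this diff with plain detectron2/yacs.
from detectron2.config import get_cfg

cfg = get_cfg()                                       # stock detectron2 defaults
cfg.set_new_allowed(True)                             # allow keys stock detectron2 does not define
cfg.merge_from_file("configs/coco/darknet53.yaml")    # resolves the _BASE_ chain recursively
cfg.merge_from_list(["SOLVER.IMS_PER_BATCH", "8"])    # command-line style override
print(cfg.MODEL.META_ARCHITECTURE, cfg.SOLVER.IMS_PER_BATCH)

The same pattern applies to every config touched in this diff; only the file path changes.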
diff --git a/configs/Base-YOLOF.yaml b/configs/Base-YOLOF.yaml old mode 100755 new mode 100644 diff --git a/configs/Base-YOLOv7.yaml b/configs/Base-YOLOv7.yaml old mode 100755 new mode 100644 index 8de4de3..542f258 --- a/configs/Base-YOLOv7.yaml +++ b/configs/Base-YOLOv7.yaml @@ -1,5 +1,5 @@ MODEL: - META_ARCHITECTURE: "YOLO" # default is YOLO, can be YOLOV7, YOLOX, YOLOMASK as well + META_ARCHITECTURE: "YOLOV7" # default is YOLO, can be YOLOV7, YOLOX, YOLOMASK as well PIXEL_MEAN: [0.406, 0.485, 0.456] # same value as PP-YOLOv2, BGR order PIXEL_STD: [0.225, 0.229, 0.224] PADDED_VALUE: 114.0 diff --git a/configs/canaries/detrt_256_6_6_regnetx_0.4g.yaml b/configs/canaries/detrt_256_6_6_regnetx_0.4g.yaml new file mode 100644 index 0000000..9c1bf7e --- /dev/null +++ b/configs/canaries/detrt_256_6_6_regnetx_0.4g.yaml @@ -0,0 +1,57 @@ +MODEL: + META_ARCHITECTURE: "Detr" + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + + BACKBONE: + NAME: "build_regnet_backbone" + REGNETS: + TYPE: "RegNetX_400MF" + OUT_FEATURES: ["s2", "s3", "s4"] # fpn produce 4 levels, only using 3 for now + # RESNETS: + # DEPTH: 50 + # STRIDE_IN_1X1: False + # OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DETR: + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 100 + ENC_LAYERS: 6 + DEC_LAYERS: 6 + HIDDEN_DIM: 256 + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + IMS_PER_BATCH: 28 + BASE_LR: 0.00005 + STEPS: (369600,) + MAX_ITER: 554400 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + BACKBONE_MULTIPLIER: 0.1 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.01 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 2 +VERSION: 2 +OUTPUT_DIR: "output/coco_detr_regx" \ No newline at end of file diff --git a/configs/canaries/detrt_256_6_6_torchvision.yaml b/configs/canaries/detrt_256_6_6_torchvision.yaml old mode 100755 new mode 100644 diff --git a/configs/canaries/regnetx_0.2g.yaml b/configs/canaries/regnetx_0.2g.yaml old mode 100755 new mode 100644 diff --git a/configs/canaries/yolomask_2gpu.yaml b/configs/canaries/yolomask_2gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/canaries/yolomask_m_8gpu.yaml b/configs/canaries/yolomask_m_8gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/coco-instance/solov2_lite.yaml b/configs/coco-instance/solov2_lite.yaml old mode 100755 new mode 100644 diff --git a/configs/coco-instance/yolomask.yaml b/configs/coco-instance/yolomask.yaml old mode 100755 new mode 100644 diff --git a/configs/coco-instance/yolomask_8gpu.yaml b/configs/coco-instance/yolomask_8gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/coco-keypoints/yolox_kpts.yaml b/configs/coco-keypoints/yolox_kpts.yaml new file mode 100644 index 0000000..f114523 --- /dev/null +++ b/configs/coco-keypoints/yolox_kpts.yaml @@ -0,0 +1,85 @@ +_BASE_: "../Base-YOLOv7.yaml" +MODEL: + PIXEL_MEAN: [0.485, 0.456, 0.406] # same value as PP-YOLOv2, RGB order + PIXEL_STD: [0.229, 0.224, 0.225] + + WEIGHTS: "" + KEYPOINT_ON: True + META_ARCHITECTURE: "YOLOX" + BACKBONE: + NAME: "build_cspdarknetx_backbone" + + DARKNET: + WEIGHTS: "" + DEPTH_WISE: False + OUT_FEATURES: ["dark3", "dark4", 
"dark5"] + + YOLO: + CLASSES: 80 + IN_FEATURES: ["dark3", "dark4", "dark5"] + CONF_THRESHOLD: 0.001 + NMS_THRESHOLD: 0.65 + IGNORE_THRESHOLD: 0.7 + WIDTH_MUL: 0.50 + DEPTH_MUL: 0.33 + LOSS_TYPE: "v7" + LOSS: + LAMBDA_IOU: 1.5 + +DATASETS: + TRAIN: ("coco_2017_train",) + # TEST: ("coco_2014_val_mini",) + TEST: ("coco_2017_val",) + +INPUT: + # FORMAT: "RGB" # using BGR default + MIN_SIZE_TRAIN: (416, 512, 608, 768) + MAX_SIZE_TRAIN: 800 # force max size train to 800? + MIN_SIZE_TEST: 640 + MAX_SIZE_TEST: 800 + # open all augmentations + JITTER_CROP: + ENABLED: False + RESIZE: + ENABLED: False + # SHAPE: (540, 960) + DISTORTION: + ENABLED: True + COLOR_JITTER: + BRIGHTNESS: True + SATURATION: True + # MOSAIC: + # ENABLED: True + # NUM_IMAGES: 4 + # DEBUG_VIS: True + # # MOSAIC_WIDTH: 960 + # # MOSAIC_HEIGHT: 540 + MOSAIC_AND_MIXUP: + ENABLED: True + # ENABLED: False + DEBUG_VIS: False + ENABLE_MIXUP: False + DISABLE_AT_ITER: 120000 + + +SOLVER: + # enable fp16 training + AMP: + ENABLED: true + IMS_PER_BATCH: 112 + BASE_LR: 0.027 + STEPS: (60000, 80000) + WARMUP_FACTOR: 0.00033333 + WARMUP_ITERS: 1200 + MAX_ITER: 230000 + LR_SCHEDULER_NAME: "WarmupCosineLR" + +TEST: + EVAL_PERIOD: 10000 + # EVAL_PERIOD: 0 +OUTPUT_DIR: "output/coco_yolox_s_kpts" +VIS_PERIOD: 5000 + +DATALOADER: + # proposals are part of the dataset_dicts, and take a lot of RAM + NUM_WORKERS: 3 diff --git a/configs/coco/cspdarknet53.yaml b/configs/coco/cspdarknet53.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/darknet53.yaml b/configs/coco/darknet53.yaml old mode 100755 new mode 100644 index 59de5b9..6c558ca --- a/configs/coco/darknet53.yaml +++ b/configs/coco/darknet53.yaml @@ -1,4 +1,4 @@ -_BASE_: "../Base-YoloV7.yaml" +_BASE_: "../Base-YOLOv7.yaml" MODEL: WEIGHTS: "" MASK_ON: False @@ -23,6 +23,9 @@ MODEL: CONF_THRESHOLD: 0.2 NMS_THRESHOLD: 0.1 IGNORE_THRESHOLD: 0.6 + NECK: + TYPE: "fpn" + WITH_SPP: True DATASETS: TRAIN: ("coco_2017_train",) TEST: ("coco_2017_val",) diff --git a/configs/coco/detr/anchordetr_origin.yaml b/configs/coco/detr/anchordetr_origin.yaml new file mode 100644 index 0000000..c522aa4 --- /dev/null +++ b/configs/coco/detr/anchordetr_origin.yaml @@ -0,0 +1,67 @@ +MODEL: + META_ARCHITECTURE: "AnchorDetr" + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + # WEIGHTS: "weights/R-50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DETR: + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_QUERY_POSITION: 300 + DIM_FEEDFORWARD: 1024 + DROPOUT: 0.0 + ENC_LAYERS: 6 + DEC_LAYERS: 6 + HIDDEN_DIM: 256 + NUM_CLASSES: 80 + YOLO: + CONF_THRESHOLD: 0.001 + IGNORE_THRESHOLD: 0.07 + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + AMP: + ENABLED: true + IMS_PER_BATCH: 16 + # BASE_LR: 0.0002 # 0.00025 is better + BASE_LR: 0.0001 # 0.00025 is better + STEPS: (295720, ) + # MAX_ITER: 369650 + MAX_ITER: 409650 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + LR_MULTIPLIER_OVERWRITE: + [{ "backbone": 0.1 }, { "reference_points": 0.1, "sampling_offsets": 0.1 }] + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + # CLIP_TYPE: "norm" + CLIP_VALUE: 0.1 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + # SIZE: (384, 632) + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 7393 # 1 
epoch same as bs=2 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 2 +VERSION: 2 + +VIS_PERIOD: 100 +OUTPUT_DIR: "output/coco_anchordetr" diff --git a/configs/coco/detr/anchordetr_origin_bs64.yaml b/configs/coco/detr/anchordetr_origin_bs64.yaml new file mode 100644 index 0000000..229e323 --- /dev/null +++ b/configs/coco/detr/anchordetr_origin_bs64.yaml @@ -0,0 +1,68 @@ +MODEL: + META_ARCHITECTURE: "AnchorDetr" + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + # WEIGHTS: "weights/R-50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DETR: + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_QUERY_POSITION: 300 + DIM_FEEDFORWARD: 1024 + DROPOUT: 0.0 + ENC_LAYERS: 6 + DEC_LAYERS: 6 + HIDDEN_DIM: 256 + NUM_CLASSES: 80 + YOLO: + CONF_THRESHOLD: 0.001 + IGNORE_THRESHOLD: 0.07 + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + AMP: + ENABLED: true + IMS_PER_BATCH: 64 + # BASE_LR: 0.0002 # 0.00025 is better + BASE_LR: 0.00025 # lr should be a little bit larger + # STEPS: (295720, ) + STEPS: (73930, ) + # MAX_ITER: 369650 + MAX_ITER: 409650 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + LR_MULTIPLIER_OVERWRITE: + [{ "backbone": 0.1 }, { "reference_points": 0.1, "sampling_offsets": 0.1 }] + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + # CLIP_TYPE: "norm" + CLIP_VALUE: 0.1 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + # SIZE: (384, 632) + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 7393 # 1 epoch same as bs=2 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 2 +VERSION: 2 + +VIS_PERIOD: 100 +OUTPUT_DIR: "output/coco_anchordetr" diff --git a/configs/coco/detr/d2go/detr_bs16.yaml b/configs/coco/detr/d2go/detr_bs16.yaml new file mode 100644 index 0000000..748bfb5 --- /dev/null +++ b/configs/coco/detr/d2go/detr_bs16.yaml @@ -0,0 +1,62 @@ +MODEL: + META_ARCHITECTURE: "DetrD2go" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DETR: + NUM_CLASSES: 80 + CLS_WEIGHT: 2.0 + DIM_FEEDFORWARD: 1024 + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 300 + CENTERED_POSITION_ENCODIND: True + USE_FOCAL_LOSS: True + NUM_FEATURE_LEVELS: 1 + ATTENTION_TYPE: 'DETR' + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + AMP: + ENABLED: true + IMS_PER_BATCH: 16 + BASE_LR: 0.0001 + STEPS: (887040,) + MAX_ITER: 1108800 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.1 + NORM_TYPE: 2.0 + LR_MULTIPLIER_OVERWRITE: [{'backbone': 0.1}, {'reference_points': 0.1, 'sampling_offsets': 0.1}] + # BACKBONE_MULTIPLIER: 0.1 + +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +# D2GO_DATA: +# MAPPER: +# NAME: "DETRDatasetMapper" + +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 4 +VERSION: 2 + diff --git a/configs/coco/detr/d2go/detr_fbv3_bs16.yaml b/configs/coco/detr/d2go/detr_fbv3_bs16.yaml new file mode 100644 index 
0000000..94e2501 --- /dev/null +++ b/configs/coco/detr/d2go/detr_fbv3_bs16.yaml @@ -0,0 +1,61 @@ +MODEL: + META_ARCHITECTURE: "DetrD2go" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + BACKBONE: + NAME: "FBNetV2C4Backbone" + FBNET_V2: + ARCH: "FBNetV3_A_dsmask_C5" + NORM: "sync_bn" + WIDTH_DIVISOR: 8 + SCALE_FACTOR: 1.0 + OUT_FEATURES: ["trunk4"] + DETR: + NUM_CLASSES: 80 + CLS_WEIGHT: 2.0 + DIM_FEEDFORWARD: 1024 + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 300 + CENTERED_POSITION_ENCODIND: True + USE_FOCAL_LOSS: True + NUM_FEATURE_LEVELS: 1 + ATTENTION_TYPE: 'DETR' + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.0002 + STEPS: (887040,) + MAX_ITER: 1108800 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.1 + NORM_TYPE: 2.0 + LR_MULTIPLIER_OVERWRITE: [{'backbone': 0.1}, {'reference_points': 0.1, 'sampling_offsets': 0.1}] + +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +D2GO_DATA: + MAPPER: + NAME: "DETRDatasetMapper" +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 4 +VERSION: 2 + diff --git a/configs/coco/detr/d2go/smca_bs16.yaml b/configs/coco/detr/d2go/smca_bs16.yaml new file mode 100644 index 0000000..125b8c6 --- /dev/null +++ b/configs/coco/detr/d2go/smca_bs16.yaml @@ -0,0 +1,66 @@ +MODEL: + META_ARCHITECTURE: "DetrD2go" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + + DETR: + NUM_CLASSES: 80 + CLS_WEIGHT: 2.0 + DIM_FEEDFORWARD: 2048 # 1024 -> 2048 + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 300 + CENTERED_POSITION_ENCODIND: True + USE_FOCAL_LOSS: True + NUM_FEATURE_LEVELS: 1 + ATTENTION_TYPE: "SMCA" + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + AMP: + ENABLED: true + IMS_PER_BATCH: 16 + BASE_LR: 0.0001 + STEPS: (295720,) + # MAX_ITER: 369650 + MAX_ITER: 429650 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.1 + NORM_TYPE: 2.0 + LR_MULTIPLIER_OVERWRITE: + [{ "backbone": 0.1 }, { "reference_points": 0.1, "sampling_offsets": 0.1 }] + +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +# D2GO_DATA: +# MAPPER: +# NAME: "DETRDatasetMapper" + +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 4 +VERSION: 2 + +OUTPUT_DIR: "output/coco_smcadetr_d2go" diff --git a/configs/coco/detr/d2go/smca_bs64.yaml b/configs/coco/detr/d2go/smca_bs64.yaml new file mode 100644 index 0000000..30f7bd8 --- /dev/null +++ b/configs/coco/detr/d2go/smca_bs64.yaml @@ -0,0 +1,68 @@ +MODEL: + META_ARCHITECTURE: "DetrD2go" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + + DETR: + NUM_CLASSES: 80 + 
CLS_WEIGHT: 2.0 + DIM_FEEDFORWARD: 2048 # 1024 -> 2048 + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 300 + CENTERED_POSITION_ENCODIND: True + USE_FOCAL_LOSS: True + NUM_FEATURE_LEVELS: 1 + ATTENTION_TYPE: "SMCA" + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + AMP: + ENABLED: true + IMS_PER_BATCH: 64 + BASE_LR: 0.00016 + # STEPS: (295720,) + # MAX_ITER: 369650 + STEPS: (73930, ) + # MAX_ITER: 369650 + MAX_ITER: 140000 # 14w we can get a 41 AP + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.1 + NORM_TYPE: 2.0 + LR_MULTIPLIER_OVERWRITE: + [{ "backbone": 0.1 }, { "reference_points": 0.1, "sampling_offsets": 0.1 }] + +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +# D2GO_DATA: +# MAPPER: +# NAME: "DETRDatasetMapper" + +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 4 +VERSION: 2 + +OUTPUT_DIR: "output/coco_smcadetr_d2go" diff --git a/configs/coco/detr/d2go/smca_fbv3.yaml b/configs/coco/detr/d2go/smca_fbv3.yaml new file mode 100644 index 0000000..deee367 --- /dev/null +++ b/configs/coco/detr/d2go/smca_fbv3.yaml @@ -0,0 +1,77 @@ +MODEL: + META_ARCHITECTURE: "DetrD2go" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + BACKBONE: + NAME: "FBNetV2C4Backbone" + FBNET_V2: + ARCH: "FBNetV3_A_dsmask_C5" + NORM: "sync_bn" + WIDTH_DIVISOR: 8 + SCALE_FACTOR: 1.0 + OUT_FEATURES: ["trunk4"] + + DETR: + NUM_CLASSES: 80 + CLS_WEIGHT: 2.0 + DIM_FEEDFORWARD: 2048 # 1024 -> 2048 + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 300 + CENTERED_POSITION_ENCODIND: True + USE_FOCAL_LOSS: True + NUM_FEATURE_LEVELS: 1 + ATTENTION_TYPE: "SMCA" + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + AMP: + ENABLED: true + IMS_PER_BATCH: 64 + BASE_LR: 0.0001 + # STEPS: (295720,) + # MAX_ITER: 369650 + STEPS: (73930, ) + # MAX_ITER: 369650 + MAX_ITER: 429650 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.1 + NORM_TYPE: 2.0 + LR_MULTIPLIER_OVERWRITE: + [{ "backbone": 1.2 }, { "reference_points": 0.1, "sampling_offsets": 0.1 }] + +# 0.00012 0.1 25.17,23.09,21.96 +# 0.00012 0.9 24, 22.29, 21.88 +# 0.00019 0.9 25.32,23.81, 23.46 +# 0.0001 0.9 24.37, 22.35, 21.57, 21.02, 20.76 +# 0.0001 1.2 23.83, 22.09, + +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +# D2GO_DATA: +# MAPPER: +# NAME: "DETRDatasetMapper" + +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 4 +VERSION: 2 + +OUTPUT_DIR: "output/coco_smcadetr_d2go_fbv3" diff --git a/configs/coco/detr/d2go/smca_regnetx_0.4g.yaml b/configs/coco/detr/d2go/smca_regnetx_0.4g.yaml new file mode 100644 index 0000000..f6b301d --- /dev/null +++ b/configs/coco/detr/d2go/smca_regnetx_0.4g.yaml @@ -0,0 +1,77 @@ +MODEL: + META_ARCHITECTURE: "DetrD2go" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + BACKBONE: + NAME: "build_regnet_backbone" + SIMPLE: true + STRIDE: 32 + CHANNEL: 384 + REGNETS: + TYPE: "RegNetX_400MF" + 
OUT_FEATURES: ["s2", "s3", "s4"] # fpn produce 4 levels, only using 3 for now + + DETR: + NUM_CLASSES: 80 + CLS_WEIGHT: 2.0 + DIM_FEEDFORWARD: 2048 # 1024 -> 2048 + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 300 + CENTERED_POSITION_ENCODIND: True + USE_FOCAL_LOSS: True + NUM_FEATURE_LEVELS: 1 + ATTENTION_TYPE: "SMCA" + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + AMP: + ENABLED: true + IMS_PER_BATCH: 64 + BASE_LR: 0.0001 + # STEPS: (295720,) + # MAX_ITER: 369650 + STEPS: (73930, ) + # MAX_ITER: 369650 + MAX_ITER: 429650 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.1 + NORM_TYPE: 2.0 + LR_MULTIPLIER_OVERWRITE: + [{ "backbone": 0.6}, { "reference_points": 0.1, "sampling_offsets": 0.1 }] + +# 0.00013, 0.1, 29.9, 24.86, 23.57 +# 0.00013, 0.9, 27.26,23.95 +# 0.0001, 0.9, 26.38, 23.74, 23.01, 22.52 +# 0.0001, 1.2, 29.43 + +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +# D2GO_DATA: +# MAPPER: +# NAME: "DETRDatasetMapper" + +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 4 +VERSION: 2 + +OUTPUT_DIR: "output/coco_smcadetr_d2go_regnetx" diff --git a/configs/coco/detr/detrt_256_6_6_regnetx_0.4g.yaml b/configs/coco/detr/detr_256_6_6_regnetx_0.4g.yaml old mode 100755 new mode 100644 similarity index 94% rename from configs/coco/detr/detrt_256_6_6_regnetx_0.4g.yaml rename to configs/coco/detr/detr_256_6_6_regnetx_0.4g.yaml index 9bc28c0..fb254d1 --- a/configs/coco/detr/detrt_256_6_6_regnetx_0.4g.yaml +++ b/configs/coco/detr/detr_256_6_6_regnetx_0.4g.yaml @@ -27,7 +27,7 @@ DATASETS: TEST: ("coco_2017_val",) SOLVER: - IMS_PER_BATCH: 64 + IMS_PER_BATCH: 32 BASE_LR: 0.0001 STEPS: (369600,) MAX_ITER: 554400 @@ -38,7 +38,7 @@ SOLVER: BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True - CLIP_TYPE: "norm" + CLIP_TYPE: "full_model" CLIP_VALUE: 0.01 NORM_TYPE: 2.0 INPUT: @@ -52,6 +52,6 @@ TEST: EVAL_PERIOD: 4000 DATALOADER: FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 2 + NUM_WORKERS: 1 VERSION: 2 OUTPUT_DIR: "output/coco_detr_regx" \ No newline at end of file diff --git a/configs/coco/detr/detrt_256_6_6_torchvision.yaml b/configs/coco/detr/detr_256_6_6_torchvision.yaml old mode 100755 new mode 100644 similarity index 90% rename from configs/coco/detr/detrt_256_6_6_torchvision.yaml rename to configs/coco/detr/detr_256_6_6_torchvision.yaml index 009416e..f66d095 --- a/configs/coco/detr/detrt_256_6_6_torchvision.yaml +++ b/configs/coco/detr/detr_256_6_6_torchvision.yaml @@ -21,6 +21,8 @@ DATASETS: TEST: ("coco_2017_val",) SOLVER: + AMP: + ENABLED: true IMS_PER_BATCH: 56 BASE_LR: 0.0001 STEPS: (369600,) @@ -32,19 +34,19 @@ SOLVER: BACKBONE_MULTIPLIER: 0.1 CLIP_GRADIENTS: ENABLED: True - # CLIP_TYPE: "full_model" - CLIP_TYPE: "norm" + CLIP_TYPE: "full_model" + # CLIP_TYPE: "norm" CLIP_VALUE: 0.01 NORM_TYPE: 2.0 INPUT: - MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832) CROP: ENABLED: True TYPE: "absolute_range" SIZE: (384, 600) FORMAT: "RGB" TEST: - EVAL_PERIOD: 4000 + EVAL_PERIOD: 10000 DATALOADER: FILTER_EMPTY_ANNOTATIONS: False NUM_WORKERS: 2 diff --git a/configs/coco/detr/detr_256_6_6_torchvision_mask.yaml b/configs/coco/detr/detr_256_6_6_torchvision_mask.yaml new file mode 100644 index 
0000000..5fb54b8 --- /dev/null +++ b/configs/coco/detr/detr_256_6_6_torchvision_mask.yaml @@ -0,0 +1,58 @@ +MODEL: + META_ARCHITECTURE: "Detr" + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: True + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DETR: + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 100 + ENC_LAYERS: 6 + DEC_LAYERS: 6 + HIDDEN_DIM: 256 + NUM_CLASSES: 250 + # hard coded for mask + # FROZEN_WEIGHTS: 'weights/detr_panoptic.pth' + YOLO: + CONF_THRESHOLD: 0.3 + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + IMS_PER_BATCH: 56 + BASE_LR: 0.0001 + STEPS: (369600,) + MAX_ITER: 554400 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + BACKBONE_MULTIPLIER: 0.1 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + # CLIP_TYPE: "norm" + CLIP_VALUE: 0.01 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 2 +VERSION: 2 + +OUTPUT_DIR: "output/coco_detr" \ No newline at end of file diff --git a/configs/coco/detr/smcadetr_origin.yaml b/configs/coco/detr/smcadetr_origin.yaml new file mode 100644 index 0000000..bbab9c2 --- /dev/null +++ b/configs/coco/detr/smcadetr_origin.yaml @@ -0,0 +1,77 @@ +MODEL: + META_ARCHITECTURE: "SMCADetr" + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + # WEIGHTS: "weights/R-50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DETR: + # CONF_THRESHOLD: 0.1 + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 300 + DIM_FEEDFORWARD: 2048 + DROPOUT: 0.1 + ENC_LAYERS: 6 + DEC_LAYERS: 6 + HIDDEN_DIM: 256 + NUM_CLASSES: 80 + # NUM_CLASSES: 81 + NUM_FEATURE_LEVELS: 1 + YOLO: + CONF_THRESHOLD: 0.0001 + IGNORE_THRESHOLD: 0.001 + +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) + +SOLVER: + AMP: + ENABLED: true + IMS_PER_BATCH: 16 + # BASE_LR: 0.0001 + BASE_LR: 0.0001 + # STEPS: (369600,) + # STEPS: (110880, 210039) + # STEPS: (295720, ) + # MAX_ITER: 369650 + STEPS: (325720, ) + MAX_ITER: 409650 + # MAX_ITER: 469650 + # MAX_ITER: 162420 + WARMUP_FACTOR: 1.0 + # detr bs=64 is 10, we using 40 for 16 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + LR_MULTIPLIER_OVERWRITE: + [{ "backbone": 0.1 }, { "reference_points": 0.1, "sampling_offsets": 0.1 }] + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + # CLIP_TYPE: "norm" + CLIP_VALUE: 0.1 + # NORM_TYPE: 2.0 + +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" + +TEST: + EVAL_PERIOD: 7393 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 2 +VERSION: 2 + +VIS_PERIOD: 100 +OUTPUT_DIR: "output/coco_smcadetr_origin" diff --git a/configs/coco/pvt_v2_b0.yaml b/configs/coco/pvt_v2_b0.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/r2_50.yaml b/configs/coco/r2_50.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/r2_50_l.yaml b/configs/coco/r2_50_l.yaml old mode 100755 new mode 100644 diff --git 
a/configs/coco/r2next_50.yaml b/configs/coco/r2next_50.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/r50.yaml b/configs/coco/r50.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/regnetx_0.4g.yaml b/configs/coco/regnetx_0.4g.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/sparseinst/Base-SparseInst.yaml b/configs/coco/sparseinst/Base-SparseInst.yaml new file mode 100644 index 0000000..8ba680f --- /dev/null +++ b/configs/coco/sparseinst/Base-SparseInst.yaml @@ -0,0 +1,39 @@ +MODEL: + META_ARCHITECTURE: "SparseInst" + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + BACKBONE: + FREEZE_AT: 0 + NAME: "build_resnet_backbone" + RESNETS: + NORM: "FrozenBN" + DEPTH: 50 + STRIDE_IN_1X1: False + OUT_FEATURES: ["res3", "res4", "res5"] + SPARSE_INST: + ENCODER: + NAME: "InstanceContextEncoder" + DECODER: + NAME: "GroupIAMDecoder" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 64 + BASE_LR: 0.00005 + STEPS: (210000, 250000) + MAX_ITER: 270000 + WEIGHT_DECAY: 0.05 +INPUT: + MIN_SIZE_TRAIN: (416, 448, 480, 512, 544, 576, 608, 640) + MAX_SIZE_TRAIN: 853 + MIN_SIZE_TEST: 640 + MAX_SIZE_TEST: 853 + FORMAT: "RGB" + MASK_FORMAT: "bitmask" +TEST: + EVAL_PERIOD: 7330 +DATALOADER: + NUM_WORKERS: 6 +VERSION: 2 diff --git a/configs/coco/sparseinst/sparse_inst_r50_base.yaml b/configs/coco/sparseinst/sparse_inst_r50_base.yaml new file mode 100644 index 0000000..6b86e58 --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50_base.yaml @@ -0,0 +1,6 @@ +_BASE_: "Base-SparseInst.yaml" +MODEL: + SPARSE_INST: + DECODER: + NAME: "BaseIAMDecoder" +OUTPUT_DIR: "output/sparse_inst_r50_base" \ No newline at end of file diff --git a/configs/coco/sparseinst/sparse_inst_r50_dcn_giam_aug.yaml b/configs/coco/sparseinst/sparse_inst_r50_dcn_giam_aug.yaml new file mode 100644 index 0000000..dba9690 --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50_dcn_giam_aug.yaml @@ -0,0 +1,11 @@ +_BASE_: "Base-SparseInst.yaml" +MODEL: + RESNETS: + DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5 +INPUT: + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + MASK_FORMAT: "polygon" +OUTPUT_DIR: "output/sparse_inst_r50_dcn_giam_aug" \ No newline at end of file diff --git a/configs/coco/sparseinst/sparse_inst_r50_giam.yaml b/configs/coco/sparseinst/sparse_inst_r50_giam.yaml new file mode 100644 index 0000000..81e4bd6 --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50_giam.yaml @@ -0,0 +1,2 @@ +_BASE_: "Base-SparseInst.yaml" +OUTPUT_DIR: "output/sparse_inst_r50_giam" \ No newline at end of file diff --git a/configs/coco/sparseinst/sparse_inst_r50_giam_aug.yaml b/configs/coco/sparseinst/sparse_inst_r50_giam_aug.yaml new file mode 100644 index 0000000..b868567 --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50_giam_aug.yaml @@ -0,0 +1,8 @@ +_BASE_: "Base-SparseInst.yaml" +INPUT: + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + MASK_FORMAT: "polygon" +OUTPUT_DIR: "output/sparse_inst_r50_giam_aug" \ No newline at end of file diff --git a/configs/coco/sparseinst/sparse_inst_r50vd_base.yaml b/configs/coco/sparseinst/sparse_inst_r50vd_base.yaml new file mode 100644 index 0000000..7a04418 --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50vd_base.yaml @@ -0,0 +1,10 @@ +_BASE_: "Base-SparseInst.yaml" +MODEL: + WEIGHTS: "../../pretrained_models/resnet50d_ra2-464e36ba.pth" +
BACKBONE: + FREEZE_AT: 0 + NAME: "build_resnet_vd_backbone" + SPARSE_INST: + DECODER: + NAME: "BaseIAMDecoder" +OUTPUT_DIR: "output/sparse_inst_r50_base" \ No newline at end of file diff --git a/configs/coco/sparseinst/sparse_inst_r50vd_dcn_giam.yaml b/configs/coco/sparseinst/sparse_inst_r50vd_dcn_giam.yaml new file mode 100644 index 0000000..1f904d3 --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50vd_dcn_giam.yaml @@ -0,0 +1,9 @@ +_BASE_: "Base-SparseInst.yaml" +MODEL: + BACKBONE: + FREEZE_AT: 0 + NAME: "build_resnet_vd_backbone" + RESNETS: + DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5 +OUTPUT_DIR: "output/sparse_inst_r50vd_dcn_giam" + diff --git a/configs/coco/sparseinst/sparse_inst_r50vd_dcn_giam_aug.yaml b/configs/coco/sparseinst/sparse_inst_r50vd_dcn_giam_aug.yaml new file mode 100644 index 0000000..40cdbdb --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50vd_dcn_giam_aug.yaml @@ -0,0 +1,15 @@ +_BASE_: "Base-SparseInst.yaml" +MODEL: + BACKBONE: + FREEZE_AT: 0 + NAME: "build_resnet_vd_backbone" + RESNETS: + DEFORM_ON_PER_STAGE: [False, False, True, True] # dcn on res4, res5 +INPUT: + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + MASK_FORMAT: "polygon" +OUTPUT_DIR: "output/sparse_inst_r50vd_dcn_giam_aug" + diff --git a/configs/coco/sparseinst/sparse_inst_r50vd_giam.yaml b/configs/coco/sparseinst/sparse_inst_r50vd_giam.yaml new file mode 100644 index 0000000..e9cd71d --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50vd_giam.yaml @@ -0,0 +1,7 @@ +_BASE_: "Base-SparseInst.yaml" +MODEL: + WEIGHTS: "../../pretrained_models/resnet50d_ra2-464e36ba.pth" + BACKBONE: + FREEZE_AT: 0 + NAME: "build_resnet_vd_backbone" +OUTPUT_DIR: "output/sparse_inst_r50vd_giam" diff --git a/configs/coco/sparseinst/sparse_inst_r50vd_giam_aug.yaml b/configs/coco/sparseinst/sparse_inst_r50vd_giam_aug.yaml new file mode 100644 index 0000000..032451a --- /dev/null +++ b/configs/coco/sparseinst/sparse_inst_r50vd_giam_aug.yaml @@ -0,0 +1,12 @@ +_BASE_: "Base-SparseInst.yaml" +MODEL: + BACKBONE: + FREEZE_AT: 0 + NAME: "build_resnet_vd_backbone" +INPUT: + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + MASK_FORMAT: "polygon" +OUTPUT_DIR: "output/sparse_inst_r50vd_giam_aug" diff --git a/configs/coco/swin_s.yaml b/configs/coco/swin_s.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/swin_t.yaml b/configs/coco/swin_t.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/yolof/yolof_CSP_D_53_DC5_3x.yaml b/configs/coco/yolof/yolof_CSP_D_53_DC5_3x.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/yolof/yolof_R_50_DC5_1x.yaml b/configs/coco/yolof/yolof_R_50_DC5_1x.yaml old mode 100755 new mode 100644 diff --git a/configs/coco/yolov6/yolov6_m.yaml b/configs/coco/yolov6/yolov6_m.yaml new file mode 100644 index 0000000..2b45ee8 --- /dev/null +++ b/configs/coco/yolov6/yolov6_m.yaml @@ -0,0 +1,82 @@ +_BASE_: "../../Base-YOLOv7.yaml" +MODEL: + PIXEL_MEAN: [0.485, 0.456, 0.406] # same value as PP-YOLOv2, RGB order + PIXEL_STD: [0.229, 0.224, 0.225] + + WEIGHTS: "" + MASK_ON: False + META_ARCHITECTURE: "YOLOV6" + BACKBONE: + NAME: "build_efficientrep_backbone" + OUT_FEATURES: ["stride8", "stride16", "stride32"] + + YOLO: + CLASSES: 80 + IN_FEATURES: ["stride8", "stride16", "stride32"] + CONF_THRESHOLD: 0.001 + NMS_THRESHOLD: 0.65 + IGNORE_THRESHOLD: 0.7 + WIDTH_MUL: 1. + DEPTH_MUL: 1. 
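+ # width/depth multipliers of 1.0/1.0 give the yolov6-m variant of the EfficientRep backbone; the yolov6_s and yolov6_tiny configs below shrink them to 0.50/0.33 and 0.50/0.25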
+ LOSS: + LAMBDA_IOU: 1.5 + NECK: + TYPE: "reppan" + +DATASETS: + TRAIN: ("coco_2017_train",) + # TEST: ("coco_2014_val_mini",) + TEST: ("coco_2017_val",) + +INPUT: + # FORMAT: "RGB" # using BGR default + MIN_SIZE_TRAIN: (416, 512, 608, 768) + MAX_SIZE_TRAIN: 800 # force max size train to 800? + MIN_SIZE_TEST: 640 + MAX_SIZE_TEST: 800 + # open all augmentations + JITTER_CROP: + ENABLED: False + RESIZE: + ENABLED: False + # SHAPE: (540, 960) + DISTORTION: + ENABLED: True + COLOR_JITTER: + BRIGHTNESS: True + SATURATION: True + # MOSAIC: + # ENABLED: True + # NUM_IMAGES: 4 + # DEBUG_VIS: True + # # MOSAIC_WIDTH: 960 + # # MOSAIC_HEIGHT: 540 + MOSAIC_AND_MIXUP: + ENABLED: True + # ENABLED: False + DEBUG_VIS: False + ENABLE_MIXUP: False + DISABLE_AT_ITER: 120000 + + +SOLVER: + # enable fp16 training + AMP: + ENABLED: true + IMS_PER_BATCH: 112 + BASE_LR: 0.027 + STEPS: (60000, 80000) + WARMUP_FACTOR: 0.00033333 + WARMUP_ITERS: 1200 + MAX_ITER: 230000 + LR_SCHEDULER_NAME: "WarmupCosineLR" + +TEST: + EVAL_PERIOD: 10000 + # EVAL_PERIOD: 0 +OUTPUT_DIR: "output/coco_yolox_s" +VIS_PERIOD: 5000 + +DATALOADER: + # proposals are part of the dataset_dicts, and take a lot of RAM + NUM_WORKERS: 3 diff --git a/configs/coco/yolov6/yolov6_s.yaml b/configs/coco/yolov6/yolov6_s.yaml new file mode 100644 index 0000000..05aac32 --- /dev/null +++ b/configs/coco/yolov6/yolov6_s.yaml @@ -0,0 +1,84 @@ +_BASE_: "../../Base-YOLOv7.yaml" +MODEL: + PIXEL_MEAN: [0.485, 0.456, 0.406] # same value as PP-YOLOv2, RGB order + PIXEL_STD: [0.229, 0.224, 0.225] + + WEIGHTS: "" + MASK_ON: False + META_ARCHITECTURE: "YOLOV6" + BACKBONE: + NAME: "build_efficientrep_backbone" + OUT_FEATURES: ["stride8", "stride16", "stride32"] + + YOLO: + CLASSES: 80 + IN_FEATURES: ["stride8", "stride16", "stride32"] + CONF_THRESHOLD: 0.001 + NMS_THRESHOLD: 0.65 + IGNORE_THRESHOLD: 0.7 + WIDTH_MUL: 0.50 + DEPTH_MUL: 0.33 + LOSS: + LAMBDA_IOU: 1.5 + HEAD: + TYPE: "yolov6" + NECK: + TYPE: "reppan" + +DATASETS: + TRAIN: ("coco_2017_train",) + # TEST: ("coco_2014_val_mini",) + TEST: ("coco_2017_val",) + +INPUT: + # FORMAT: "RGB" # using BGR default + MIN_SIZE_TRAIN: (416, 512, 608, 768) + MAX_SIZE_TRAIN: 800 # force max size train to 800? 
+ MIN_SIZE_TEST: 640 + MAX_SIZE_TEST: 800 + # open all augmentations + JITTER_CROP: + ENABLED: False + RESIZE: + ENABLED: False + # SHAPE: (540, 960) + DISTORTION: + ENABLED: True + COLOR_JITTER: + BRIGHTNESS: True + SATURATION: True + # MOSAIC: + # ENABLED: True + # NUM_IMAGES: 4 + # DEBUG_VIS: True + # # MOSAIC_WIDTH: 960 + # # MOSAIC_HEIGHT: 540 + MOSAIC_AND_MIXUP: + ENABLED: True + # ENABLED: False + DEBUG_VIS: False + ENABLE_MIXUP: False + DISABLE_AT_ITER: 120000 + + +SOLVER: + # enable fp16 training + AMP: + ENABLED: true + IMS_PER_BATCH: 80 + BASE_LR: 0.02 + STEPS: (60000, 80000) + WARMUP_FACTOR: 0.00033333 + WARMUP_ITERS: 1200 + MAX_ITER: 230000 + LR_SCHEDULER_NAME: "WarmupCosineLR" + +TEST: + EVAL_PERIOD: 10000 + # EVAL_PERIOD: 0 +OUTPUT_DIR: "output/coco_yolox_s" +VIS_PERIOD: 5000 + +DATALOADER: + # proposals are part of the dataset_dicts, and take a lot of RAM + NUM_WORKERS: 3 diff --git a/configs/coco/yolov6/yolov6_tiny.yaml b/configs/coco/yolov6/yolov6_tiny.yaml new file mode 100644 index 0000000..3409ae1 --- /dev/null +++ b/configs/coco/yolov6/yolov6_tiny.yaml @@ -0,0 +1,82 @@ +_BASE_: "../../Base-YOLOv7.yaml" +MODEL: + PIXEL_MEAN: [0.485, 0.456, 0.406] # same value as PP-YOLOv2, RGB order + PIXEL_STD: [0.229, 0.224, 0.225] + + WEIGHTS: "" + MASK_ON: False + META_ARCHITECTURE: "YOLOV6" + BACKBONE: + NAME: "build_efficientrep_backbone" + OUT_FEATURES: ["stride8", "stride16", "stride32"] + + YOLO: + CLASSES: 80 + IN_FEATURES: ["stride8", "stride16", "stride32"] + CONF_THRESHOLD: 0.001 + NMS_THRESHOLD: 0.65 + IGNORE_THRESHOLD: 0.7 + WIDTH_MUL: 0.50 + DEPTH_MUL: 0.25 + LOSS: + LAMBDA_IOU: 1.5 + NECK: + TYPE: "reppan" + +DATASETS: + TRAIN: ("coco_2017_train",) + # TEST: ("coco_2014_val_mini",) + TEST: ("coco_2017_val",) + +INPUT: + # FORMAT: "RGB" # using BGR default + MIN_SIZE_TRAIN: (416, 512, 608, 768) + MAX_SIZE_TRAIN: 800 # force max size train to 800? 
+ MIN_SIZE_TEST: 640 + MAX_SIZE_TEST: 800 + # open all augmentations + JITTER_CROP: + ENABLED: False + RESIZE: + ENABLED: False + # SHAPE: (540, 960) + DISTORTION: + ENABLED: True + COLOR_JITTER: + BRIGHTNESS: True + SATURATION: True + # MOSAIC: + # ENABLED: True + # NUM_IMAGES: 4 + # DEBUG_VIS: True + # # MOSAIC_WIDTH: 960 + # # MOSAIC_HEIGHT: 540 + MOSAIC_AND_MIXUP: + ENABLED: True + # ENABLED: False + DEBUG_VIS: False + ENABLE_MIXUP: False + DISABLE_AT_ITER: 120000 + + +SOLVER: + # enable fp16 training + AMP: + ENABLED: true + IMS_PER_BATCH: 112 + BASE_LR: 0.027 + STEPS: (60000, 80000) + WARMUP_FACTOR: 0.00033333 + WARMUP_ITERS: 1200 + MAX_ITER: 230000 + LR_SCHEDULER_NAME: "WarmupCosineLR" + +TEST: + EVAL_PERIOD: 10000 + # EVAL_PERIOD: 0 +OUTPUT_DIR: "output/coco_yolox_s" +VIS_PERIOD: 5000 + +DATALOADER: + # proposals are part of the dataset_dicts, and take a lot of RAM + NUM_WORKERS: 3 diff --git a/configs/coco/yolox/yolox_convnext.yaml b/configs/coco/yolox/yolox_convnext.yaml new file mode 100644 index 0000000..a6261bd --- /dev/null +++ b/configs/coco/yolox/yolox_convnext.yaml @@ -0,0 +1,86 @@ +_BASE_: "../../Base-YOLOv7.yaml" +MODEL: + PIXEL_MEAN: [0.485, 0.456, 0.406] # same value as PP-YOLOv2, RGB order + PIXEL_STD: [0.229, 0.224, 0.225] + + WEIGHTS: "" + MASK_ON: False + META_ARCHITECTURE: "YOLOX" + BACKBONE: + NAME: "build_convnext_backbone" + + DARKNET: + WEIGHTS: "" + DEPTH_WISE: False + OUT_FEATURES: [0, 1, 2] + + YOLO: + CLASSES: 80 + # IN_FEATURES: ["dark3", "dark4", "dark5"] + IN_FEATURES: [0, 1, 2] + CONF_THRESHOLD: 0.001 + NMS_THRESHOLD: 0.65 + IGNORE_THRESHOLD: 0.7 + WIDTH_MUL: 0.50 + DEPTH_MUL: 0.33 + LOSS_TYPE: "v7" + LOSS: + LAMBDA_IOU: 1.5 + +DATASETS: + TRAIN: ("coco_2017_train",) + # TEST: ("coco_2014_val_mini",) + TEST: ("coco_2017_val",) + +INPUT: + # FORMAT: "RGB" # using BGR default + MIN_SIZE_TRAIN: (416, 512, 608, 768) + MAX_SIZE_TRAIN: 800 # force max size train to 800? 
+ MIN_SIZE_TEST: 640 + MAX_SIZE_TEST: 800 + # open all augmentations + JITTER_CROP: + ENABLED: False + RESIZE: + ENABLED: False + # SHAPE: (540, 960) + DISTORTION: + ENABLED: True + COLOR_JITTER: + BRIGHTNESS: True + SATURATION: True + # MOSAIC: + # ENABLED: True + # NUM_IMAGES: 4 + # DEBUG_VIS: True + # # MOSAIC_WIDTH: 960 + # # MOSAIC_HEIGHT: 540 + MOSAIC_AND_MIXUP: + ENABLED: True + # ENABLED: False + DEBUG_VIS: False + ENABLE_MIXUP: False + DISABLE_AT_ITER: 120000 + + +SOLVER: + # enable fp16 training + AMP: + ENABLED: true + IMS_PER_BATCH: 112 + BASE_LR: 0.027 + STEPS: (60000, 80000) + WARMUP_FACTOR: 0.00033333 + WARMUP_ITERS: 1200 + MAX_ITER: 230000 + LR_SCHEDULER_NAME: "WarmupCosineLR" + +TEST: + EVAL_PERIOD: 10000 + # EVAL_PERIOD: 0 +OUTPUT_DIR: "output/coco_yolox_s" +VIS_PERIOD: 5000 + +DATALOADER: + # proposals are part of the dataset_dicts, and take a lot of RAM + NUM_WORKERS: 3 diff --git a/configs/coco/yolox_regnetx_s.yaml b/configs/coco/yolox_regnetx_s.yaml new file mode 100644 index 0000000..4899867 --- /dev/null +++ b/configs/coco/yolox_regnetx_s.yaml @@ -0,0 +1,86 @@ +_BASE_: "../Base-YOLOv7.yaml" +MODEL: + PIXEL_MEAN: [0.485, 0.456, 0.406] # same value as PP-YOLOv2, RGB order + PIXEL_STD: [0.229, 0.224, 0.225] + + WEIGHTS: "" + MASK_ON: False + META_ARCHITECTURE: "YOLOX" + BACKBONE: + NAME: "build_regnet_backbone" + SIMPLE: true + STRIDE: 32 + CHANNEL: 384 + REGNETS: + TYPE: "RegNetX_400MF" + OUT_FEATURES: ["s2", "s3", "s4"] # fpn produce 4 levels, only using 3 for now + + YOLO: + CLASSES: 80 + IN_FEATURES: ["s2", "s3", "s4"] + CONF_THRESHOLD: 0.001 + NMS_THRESHOLD: 0.65 + IGNORE_THRESHOLD: 0.7 + WIDTH_MUL: 0.50 + DEPTH_MUL: 0.33 + LOSS_TYPE: "v7" + LOSS: + LAMBDA_IOU: 1.5 + +DATASETS: + TRAIN: ("coco_2017_train",) + # TEST: ("coco_2014_val_mini",) + TEST: ("coco_2017_val",) + +INPUT: + # FORMAT: "RGB" # using BGR default + MIN_SIZE_TRAIN: (416, 512, 608, 768) + MAX_SIZE_TRAIN: 800 # force max size train to 800? 
+ MIN_SIZE_TEST: 640 + MAX_SIZE_TEST: 800 + # open all augmentations + JITTER_CROP: + ENABLED: False + RESIZE: + ENABLED: False + # SHAPE: (540, 960) + DISTORTION: + ENABLED: True + COLOR_JITTER: + BRIGHTNESS: True + SATURATION: True + # MOSAIC: + # ENABLED: True + # NUM_IMAGES: 4 + # DEBUG_VIS: True + # # MOSAIC_WIDTH: 960 + # # MOSAIC_HEIGHT: 540 + MOSAIC_AND_MIXUP: + ENABLED: True + # ENABLED: False + DEBUG_VIS: False + ENABLE_MIXUP: False + DISABLE_AT_ITER: 120000 + + +SOLVER: + # enable fp16 training + AMP: + ENABLED: true + IMS_PER_BATCH: 112 + BASE_LR: 0.027 + STEPS: (60000, 80000) + WARMUP_FACTOR: 0.00033333 + WARMUP_ITERS: 1200 + MAX_ITER: 230000 + LR_SCHEDULER_NAME: "WarmupCosineLR" + +TEST: + EVAL_PERIOD: 10000 + # EVAL_PERIOD: 0 +OUTPUT_DIR: "output/coco_yolox_s" +VIS_PERIOD: 5000 + +DATALOADER: + # proposals are part of the dataset_dicts, and take a lot of RAM + NUM_WORKERS: 3 diff --git a/configs/coco/yolox_s.yaml b/configs/coco/yolox_s.yaml old mode 100755 new mode 100644 diff --git a/configs/common/coco_schedule.py b/configs/common/coco_schedule.py old mode 100755 new mode 100644 diff --git a/configs/common/data/coco.py b/configs/common/data/coco.py old mode 100755 new mode 100644 diff --git a/configs/common/data/coco_keypoint.py b/configs/common/data/coco_keypoint.py old mode 100755 new mode 100644 diff --git a/configs/common/data/coco_panoptic_separated.py b/configs/common/data/coco_panoptic_separated.py old mode 100755 new mode 100644 diff --git a/configs/common/models/mask_rcnn_fpn.py b/configs/common/models/mask_rcnn_fpn.py old mode 100755 new mode 100644 diff --git a/configs/common/models/mask_rcnn_fpn_kps.py b/configs/common/models/mask_rcnn_fpn_kps.py old mode 100755 new mode 100644 diff --git a/configs/common/models/panoptic_fpn.py b/configs/common/models/panoptic_fpn.py old mode 100755 new mode 100644 diff --git a/configs/common/optim.py b/configs/common/optim.py old mode 100755 new mode 100644 diff --git a/configs/common/readme.md b/configs/common/readme.md old mode 100755 new mode 100644 diff --git a/configs/common/train.py b/configs/common/train.py old mode 100755 new mode 100644 diff --git a/configs/facemask/cspdarknet53_1gpu.yaml b/configs/facemask/cspdarknet53_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/facemask/r2_50_1gpu.yaml b/configs/facemask/r2_50_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/facemask/r50_1gpu.yaml b/configs/facemask/r50_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/facemask/r50_pan_1gpu.yaml b/configs/facemask/r50_pan_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/facemask/swin_tiny_1gpu.yaml b/configs/facemask/swin_tiny_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/facemask/yolov5_s.yaml b/configs/facemask/yolov5_s.yaml old mode 100755 new mode 100644 diff --git a/configs/new_baselines/mask_rcnn_R_50_FPN_100ep_LSJ.py b/configs/new_baselines/mask_rcnn_R_50_FPN_100ep_LSJ.py old mode 100755 new mode 100644 diff --git a/configs/new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_200ep_LSJ.py b/configs/new_baselines/mask_rcnn_regnetx_4gf_dds_FPN_200ep_LSJ.py old mode 100755 new mode 100644 diff --git a/configs/new_baselines/maskrcnn_kps_regnetx_0.4g.py b/configs/new_baselines/maskrcnn_kps_regnetx_0.4g.py old mode 100755 new mode 100644 diff --git a/configs/new_baselines/maskrcnn_regnetx_0.4g.py b/configs/new_baselines/maskrcnn_regnetx_0.4g.py old mode 100755 new mode 100644 diff --git a/configs/new_baselines/panoptic_fpn_regnetx_0.4g.py 
b/configs/new_baselines/panoptic_fpn_regnetx_0.4g.py old mode 100755 new mode 100644 diff --git a/configs/new_baselines/panoptic_fpn_regnetx_0.4g_s.py b/configs/new_baselines/panoptic_fpn_regnetx_0.4g_s.py old mode 100755 new mode 100644 diff --git a/configs/taco/darknet53.yaml b/configs/taco/darknet53.yaml old mode 100755 new mode 100644 diff --git a/configs/taco/r50.yaml b/configs/taco/r50.yaml old mode 100755 new mode 100644 diff --git a/configs/tidal_plate/yolox_s.yaml b/configs/tidal_plate/yolox_s.yaml new file mode 100644 index 0000000..928e446 --- /dev/null +++ b/configs/tidal_plate/yolox_s.yaml @@ -0,0 +1,98 @@ +_BASE_: "../Base-YOLOv7.yaml" +MODEL: + PIXEL_MEAN: [0.485, 0.456, 0.406] # same value as PP-YOLOv2, RGB order + PIXEL_STD: [0.229, 0.224, 0.225] + + WEIGHTS: "" + MASK_ON: False + META_ARCHITECTURE: "YOLOX" + BACKBONE: + NAME: "build_cspdarknetx_backbone" + + DARKNET: + WEIGHTS: "" + DEPTH_WISE: False + OUT_FEATURES: ["dark3", "dark4", "dark5"] + + YOLO: + CLASSES: 13 + IN_FEATURES: ["dark3", "dark4", "dark5"] + # CONF_THRESHOLD: 0.01 + CONF_THRESHOLD: 0.01 + NMS_THRESHOLD: 0.65 + IGNORE_THRESHOLD: 0.7 + WIDTH_MUL: 0.50 + DEPTH_MUL: 0.33 + LOSS_TYPE: "v7" + LOSS: + LAMBDA_IOU: 1.5 + +DATASETS: + TRAIN: ("tl_train",) + TEST: ("tl_val",) + CLASS_NAMES: + [ + "tidal_plate.forward", + "tidal_plate.left", + "tidal_plate.right", + "tidal_plate.left_forward", + "tidal_plate.right_forward", + "tidal_plate.u_turn", + "tidal_plate.u_turn_left", + "tidal_plate.u_turn_forward", + "tidal_plate.u_turn_left_forward", + "tidal_plate.left_forward_right", + "tidal_plate.unknown", + "tidal_plate.cross_forbidden", + "tidal_plate.black_screen", + ] + +INPUT: + # FORMAT: "RGB" # using BGR default + MIN_SIZE_TRAIN: (416, 512, 608, 768, 800) + MAX_SIZE_TRAIN: 1920 # force max size train to 800? 
+ MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1920 + # open all augmentations + JITTER_CROP: + ENABLED: False + RESIZE: + ENABLED: False + # SHAPE: (540, 960) + DISTORTION: + ENABLED: True + # MOSAIC: + # ENABLED: True + # NUM_IMAGES: 4 + # DEBUG_VIS: True + # # MOSAIC_WIDTH: 960 + # # MOSAIC_HEIGHT: 540 + MOSAIC_AND_MIXUP: + ENABLED: True + DEBUG_VIS: False + ENABLE_MIXUP: True + DISABLE_AT_ITER: 120000 + +SOLVER: + # enable fp16 training + AMP: + ENABLED: true + IMS_PER_BATCH: 112 + # IMS_PER_BATCH: 12 + BASE_LR: 0.025 + STEPS: (60000, 80000) + WARMUP_FACTOR: 0.00033333 + WARMUP_ITERS: 1500 + MAX_ITER: 150000 + LR_SCHEDULER_NAME: "WarmupCosineLR" + CHECKPOINT_PERIOD: 5000 + +TEST: + EVAL_PERIOD: 5000 + # EVAL_PERIOD: 0 +OUTPUT_DIR: "output/tidal_plate_yolox_s" +VIS_PERIOD: 5000 + +DATALOADER: + # proposals are part of the dataset_dicts, and take a lot of RAM + NUM_WORKERS: 3 diff --git a/configs/tl/cspdarknet.yaml b/configs/tl/cspdarknet.yaml old mode 100755 new mode 100644 diff --git a/configs/tl/darknet53.yaml b/configs/tl/darknet53.yaml old mode 100755 new mode 100644 diff --git a/configs/tl/detr/detrt_256_6_6_regnetx_0.4g.yaml b/configs/tl/detr/detrt_256_6_6_regnetx_0.4g.yaml new file mode 100644 index 0000000..a742b50 --- /dev/null +++ b/configs/tl/detr/detrt_256_6_6_regnetx_0.4g.yaml @@ -0,0 +1,58 @@ +MODEL: + META_ARCHITECTURE: "Detr" + WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + MASK_ON: False + + BACKBONE: + NAME: "build_regnet_backbone" + REGNETS: + TYPE: "RegNetX_400MF" + OUT_FEATURES: ["s2", "s3", "s4"] # fpn produce 4 levels, only using 3 for now + # RESNETS: + # DEPTH: 50 + # STRIDE_IN_1X1: False + # OUT_FEATURES: ["res2", "res3", "res4", "res5"] + DETR: + GIOU_WEIGHT: 2.0 + L1_WEIGHT: 5.0 + NUM_OBJECT_QUERIES: 100 + ENC_LAYERS: 6 + DEC_LAYERS: 6 + HIDDEN_DIM: 256 + CLASSES: 5 + +DATASETS: + TRAIN: ("tl_train",) + TEST: ("tl_val",) + +SOLVER: + IMS_PER_BATCH: 56 + BASE_LR: 0.001 + STEPS: (369600,) + MAX_ITER: 554400 + WARMUP_FACTOR: 1.0 + WARMUP_ITERS: 10 + WEIGHT_DECAY: 0.0001 + OPTIMIZER: "ADAMW" + BACKBONE_MULTIPLIER: 0.1 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "norm" + CLIP_VALUE: 0.01 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + CROP: + ENABLED: True + TYPE: "absolute_range" + SIZE: (384, 600) + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 4000 +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: False + NUM_WORKERS: 2 +VERSION: 2 +OUTPUT_DIR: "output/coco_detr_regx" \ No newline at end of file diff --git a/configs/tl/r2_50.yaml b/configs/tl/r2_50.yaml old mode 100755 new mode 100644 diff --git a/configs/tl/r50.yaml b/configs/tl/r50.yaml old mode 100755 new mode 100644 diff --git a/configs/tl/res2net_bifpn.yaml b/configs/tl/res2net_bifpn.yaml old mode 100755 new mode 100644 diff --git a/configs/tl/res2net_fpn.yaml b/configs/tl/res2net_fpn.yaml old mode 100755 new mode 100644 diff --git a/configs/tl/x_s_pafpn_1gpu.yaml b/configs/tl/x_s_pafpn_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/tl/yolov5_s.yaml b/configs/tl/yolov5_s.yaml old mode 100755 new mode 100644 diff --git a/configs/tl/yolox_s.yaml b/configs/tl/yolox_s.yaml old mode 100755 new mode 100644 index 3b48bab..1aca748 --- a/configs/tl/yolox_s.yaml +++ b/configs/tl/yolox_s.yaml @@ -57,7 +57,6 @@ INPUT: ENABLE_MIXUP: True DISABLE_AT_ITER: 120000 - SOLVER: # enable fp16 training AMP: @@ -68,15 +67,16 @@ SOLVER: STEPS: (60000, 80000) WARMUP_FACTOR: 0.00033333 
WARMUP_ITERS: 1500 - MAX_ITER: 230000 + MAX_ITER: 150000 LR_SCHEDULER_NAME: "WarmupCosineLR" + CHECKPOINT_PERIOD: 5000 TEST: - EVAL_PERIOD: 10000 + EVAL_PERIOD: 5000 # EVAL_PERIOD: 0 OUTPUT_DIR: "output/tl_yolox_s" VIS_PERIOD: 5000 DATALOADER: # proposals are part of the dataset_dicts, and take a lot of RAM - NUM_WORKERS: 1 + NUM_WORKERS: 3 diff --git a/configs/tl/yolox_s_1gpu.yaml b/configs/tl/yolox_s_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/visdrone/r2_50_1gpu.yaml b/configs/visdrone/r2_50_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/visdrone/yolov5_s.yaml b/configs/visdrone/yolov5_s.yaml old mode 100755 new mode 100644 diff --git a/configs/visdrone/yolox_s_1gpu.yaml b/configs/visdrone/yolox_s_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/voc/darknet53_1gpu.yaml b/configs/voc/darknet53_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/voc/r2_50_1gpu.yaml b/configs/voc/r2_50_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/voc/x_s_pafpn_1gpu.yaml b/configs/voc/x_s_pafpn_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/voc/yolov5_s.yaml b/configs/voc/yolov5_s.yaml old mode 100755 new mode 100644 diff --git a/configs/voc/yolox_s_1gpu.yaml b/configs/voc/yolox_s_1gpu.yaml old mode 100755 new mode 100644 diff --git a/configs/wearmask/cspdarknet53.yaml b/configs/wearmask/cspdarknet53.yaml old mode 100755 new mode 100644 index ea02c5c..7d39da0 --- a/configs/wearmask/cspdarknet53.yaml +++ b/configs/wearmask/cspdarknet53.yaml @@ -1,4 +1,4 @@ -_BASE_: "../Base-YoloV7.yaml" +_BASE_: "../Base-YOLOv7.yaml" MODEL: META_ARCHITECTURE: "YOLOV7" WEIGHTS: "" diff --git a/configs/wearmask/cspdarknet53_1gpu.yaml b/configs/wearmask/cspdarknet53_1gpu.yaml old mode 100755 new mode 100644 index f682fb5..bee56b9 --- a/configs/wearmask/cspdarknet53_1gpu.yaml +++ b/configs/wearmask/cspdarknet53_1gpu.yaml @@ -1,4 +1,4 @@ -_BASE_: "../Base-YoloV7.yaml" +_BASE_: "../Base-YOLOv7.yaml" MODEL: WEIGHTS: "" META_ARCHITECTURE: "YOLOV7" diff --git a/configs/wearmask/darknet53.yaml b/configs/wearmask/darknet53.yaml old mode 100755 new mode 100644 index 4fc2321..a2636da --- a/configs/wearmask/darknet53.yaml +++ b/configs/wearmask/darknet53.yaml @@ -1,4 +1,4 @@ -_BASE_: "../Base-YoloV7.yaml" +_BASE_: "../Base-YOLOv7.yaml" MODEL: WEIGHTS: "" MASK_ON: False diff --git a/configs/wearmask/efficient_b2.yaml b/configs/wearmask/efficient_b2.yaml old mode 100755 new mode 100644 index e82d62e..da4edf9 --- a/configs/wearmask/efficient_b2.yaml +++ b/configs/wearmask/efficient_b2.yaml @@ -1,4 +1,4 @@ -_BASE_: "../Base-YoloV7.yaml" +_BASE_: "../Base-YOLOv7.yaml" MODEL: META_ARCHITECTURE: "YOLOV7" WEIGHTS: "" diff --git a/configs/wearmask/r50.yaml b/configs/wearmask/r50.yaml old mode 100755 new mode 100644 index b213d41..8666ab2 --- a/configs/wearmask/r50.yaml +++ b/configs/wearmask/r50.yaml @@ -1,4 +1,4 @@ -_BASE_: "../Base-YoloV7.yaml" +_BASE_: "../Base-YOLOv7.yaml" MODEL: META_ARCHITECTURE: "YOLOV7" WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" diff --git a/configs/wearmask/r50_1gpu.yaml b/configs/wearmask/r50_1gpu.yaml old mode 100755 new mode 100644 index 16d225a..812c2c4 --- a/configs/wearmask/r50_1gpu.yaml +++ b/configs/wearmask/r50_1gpu.yaml @@ -1,4 +1,4 @@ -_BASE_: "../Base-YoloV7.yaml" +_BASE_: "../Base-YOLOv7.yaml" MODEL: META_ARCHITECTURE: "YOLOV7" WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" diff --git a/configs/wearmask/r50_bifpn.yaml b/configs/wearmask/r50_bifpn.yaml old mode 100755 new mode 100644 diff --git 
a/configs/wearmask/r50_pan.yaml b/configs/wearmask/r50_pan.yaml old mode 100755 new mode 100644 index dcc3f7c..3c3a907 --- a/configs/wearmask/r50_pan.yaml +++ b/configs/wearmask/r50_pan.yaml @@ -1,4 +1,4 @@ -_BASE_: "../Base-YoloV7.yaml" +_BASE_: "../Base-YOLOv7.yaml" MODEL: META_ARCHITECTURE: "YOLOV7" WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" diff --git a/configs/wearmask/yolov5_s.yaml b/configs/wearmask/yolov5_s.yaml old mode 100755 new mode 100644 diff --git a/demo.py b/demo.py old mode 100755 new mode 100644 index a70d633..12ef5f0 --- a/demo.py +++ b/demo.py @@ -1,37 +1,30 @@ -# Copyright (c) Facebook, Inc. and its affiliates. import argparse -import glob import multiprocessing as mp -import os +import pathlib +import random import time + import cv2 -from numpy.core.fromnumeric import sort -import tqdm +import detectron2.data.transforms as T import torch -import time -import random -from detectron2.data.detection_utils import read_image -from detectron2.utils.logger import setup_logger - -import numpy as np -from detectron2.data.catalog import MetadataCatalog +from alfred.utils.file_io import ImageSourceIter +from alfred.vis.image.det import visualize_det_cv2_part +from alfred.vis.image.mask import vis_bitmasks_with_classes +from detectron2.checkpoint import DetectionCheckpointer from detectron2.config import get_cfg +from detectron2.data.catalog import MetadataCatalog from detectron2.modeling import build_model -import detectron2.data.transforms as T -from detectron2.checkpoint import DetectionCheckpointer +from detectron2.structures.masks import BitMasks +from detectron2.utils.logger import setup_logger +from tqdm import trange from yolov7.config import add_yolo_config - -from alfred.vis.image.mask import label2color_mask, vis_bitmasks, vis_bitmasks_with_classes -from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy - # constants WINDOW_NAME = "COCO detections" class DefaultPredictor: - def __init__(self, cfg): self.cfg = cfg.clone() # cfg can be modified by model self.model = build_model(self.cfg) @@ -54,9 +47,8 @@ def __call__(self, original_image): if self.input_format == "RGB": original_image = original_image[:, :, ::-1] height, width = original_image.shape[:2] - image = self.aug.get_transform( - original_image).apply_image(original_image) - print('image after transform: ', image.shape) + image = self.aug.get_transform(original_image).apply_image(original_image) + print("image after transform: ", image.shape) image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) inputs = {"image": image, "height": height, "width": width} tic = time.time() @@ -64,12 +56,11 @@ def __call__(self, original_image): predictions = self.model([inputs]) predictions = predictions[0] c = time.time() - tic - print('cost: {}, fps: {}'.format(c, 1/c)) + print("cost: {}, fps: {}".format(c, 1 / c)) return predictions def setup_cfg(args): - # load config from file and command-line arguments cfg = get_cfg() add_yolo_config(cfg) cfg.merge_from_file(args.config_file) @@ -78,55 +69,64 @@ def setup_cfg(args): cfg.MODEL.YOLO.CONF_THRESHOLD = args.confidence_threshold cfg.MODEL.YOLO.NMS_THRESHOLD = args.nms_threshold cfg.MODEL.YOLO.IGNORE_THRESHOLD = 0.1 - - # cfg.INPUT.MIN_SIZE_TEST = 672 # 90ms - # cfg.INPUT.MIN_SIZE_TEST = 2560 # 90ms - # cfg.INPUT.MAX_SIZE_TEST = 3060 # 90ms - cfg.INPUT.MAX_SIZE_TEST = 800 # 90ms - # cfg.INPUT.MIN_SIZE_TEST = 512 # 70ms - # cfg.INPUT.MIN_SIZE_TEST = 1080 # 40ms - # cfg.INPUT.MAX_SIZE_TEST = 512 # 40ms - # 
cfg.INPUT.MAX_SIZE_TEST = 1080 # 70ms + # force devices based on user device + cfg.MODEL.DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + cfg.INPUT.MAX_SIZE_TEST = 600 # 90ms cfg.freeze() return cfg def get_parser(): - parser = argparse.ArgumentParser( - description="Detectron2 demo for builtin configs") + parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs") parser.add_argument( "--config-file", default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", metavar="FILE", help="path to config file", ) - parser.add_argument("--webcam", action="store_true", - help="Take inputs from webcam.") - parser.add_argument("--video-input", help="Path to video file.") parser.add_argument( + "--webcam", action="store_true", help="Take inputs from webcam." + ) + parser.add_argument( + "-i", "--input", # nargs="+", help="A list of space separated input images; " "or a single glob pattern such as 'directory/*.jpg'", ) parser.add_argument( + "-o", "--output", help="A file or directory to save output visualizations. " "If not given, will show output in an OpenCV window.", ) parser.add_argument( - '-c', "--confidence-threshold", + "-c", + "--confidence-threshold", type=float, default=0.21, help="Minimum score for instance predictions to be shown", ) parser.add_argument( - '-n', "--nms-threshold", + "-n", + "--nms-threshold", type=float, default=0.6, help="Minimum score for instance predictions to be shown", ) + parser.add_argument( + "--wandb-project", + type=str, + default=None, + help="Name of Weights & Biases Project.", + ) + parser.add_argument( + "--wandb-entity", + type=str, + default=None, + help="Name of Weights & Biases Entity.", + ) parser.add_argument( "--opts", help="Modify config options using the command-line 'KEY VALUE' pairs", @@ -136,27 +136,50 @@ def get_parser(): return parser -def vis_res_fast(res, img, meta, colors): - ins = res['instances'] - bboxes = ins.pred_boxes.tensor.cpu().numpy() +def vis_res_fast(res, img, class_names, colors, thresh): + ins = res["instances"] + bboxes = None + if ins.has("pred_boxes"): + bboxes = ins.pred_boxes.tensor.cpu().numpy() scores = ins.scores.cpu().numpy() clss = ins.pred_classes.cpu().numpy() - - if ins.has('pred_bit_masks'): - # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - # img = np.stack((img,)*3, axis=-1) - + if ins.has("pred_bit_masks"): bit_masks = ins.pred_bit_masks + if isinstance(bit_masks, BitMasks): + bit_masks = bit_masks.tensor.cpu().numpy() # img = vis_bitmasks_with_classes(img, clss, bit_masks) # img = vis_bitmasks_with_classes(img, clss, bit_masks, force_colors=colors, mask_border_color=(255, 255, 255), thickness=2) - # img = vis_bitmasks_with_classes(img, clss, bit_masks, force_colors=None, mask_border_color=None, thickness=2) - # img = vis_bitmasks(img, bit_masks, thickness=2, draw_contours=False) - img = vis_bitmasks(img, bit_masks, thickness=2, draw_contours=True, fill_mask=True) - # print('img shape: ', img.shape) - thickness = 1 if ins.has('pred_bit_masks') else 2 - font_scale = 0.3 if ins.has('pred_bit_masks') else 0.4 - img = visualize_det_cv2_part( - img, scores, clss, bboxes, force_color=colors, line_thickness=thickness, font_scale=font_scale) + img = vis_bitmasks_with_classes( + img, clss, bit_masks, force_colors=None, draw_contours=True, alpha=0.8 + ) + + if ins.has("pred_masks"): + bit_masks = ins.pred_masks + if isinstance(bit_masks, BitMasks): + bit_masks = bit_masks.tensor.cpu().numpy() + img = vis_bitmasks_with_classes( + img, + clss, + bit_masks, + 
force_colors=None, + draw_contours=True, + alpha=0.6, + thickness=2, + ) + thickness = 1 if ins.has("pred_bit_masks") else 2 + font_scale = 0.3 if ins.has("pred_bit_masks") else 0.4 + if bboxes is not None: + img = visualize_det_cv2_part( + img, + scores, + clss, + bboxes, + class_names=class_names, + force_color=colors, + line_thickness=thickness, + font_scale=font_scale, + thresh=thresh, + ) # img = cv2.addWeighted(img, 0.9, m, 0.6, 0.9) return img @@ -169,51 +192,55 @@ def vis_res_fast(res, img, meta, colors): logger.info("Arguments: " + str(args)) cfg = setup_cfg(args) - metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) + class_names = cfg.DATASETS.CLASS_NAMES predictor = DefaultPredictor(cfg) print(cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST) - colors = [[random.randint(0, 255) for _ in range(3)] - for _ in range(cfg.MODEL.YOLO.CLASSES)] - - if args.input: - if os.path.isdir(args.input): - imgs = glob.glob(os.path.join(args.input, '*.jpg')) - imgs = sorted(imgs) - for path in imgs: - # use PIL, to be consistent with evaluation - img = cv2.imread(path) - print('ori img shape: ', img.shape) - res = predictor(img) - res = vis_res_fast(res, img, metadata, colors) - # cv2.imshow('frame', res) - cv2.imshow('frame', res) - if cv2.waitKey(0) & 0xFF == ord('q'): - break + colors = [ + [random.randint(0, 255) for _ in range(3)] + for _ in range(cfg.MODEL.YOLO.CLASSES) + ] + conf_thresh = cfg.MODEL.YOLO.CONF_THRESHOLD + print("confidence thresh: ", conf_thresh) + + iter = ImageSourceIter(args.input) + if args.wandb_project is not None: + from wandadb.wandb_logger import WandbInferenceLogger + + inference_logger = WandbInferenceLogger( + wandb_entity=args.wandb_entity, + wandb_project=args.wandb_project, + conf_threshold=args.confidence_threshold, + config=cfg, + ) + else: + inference_logger = None + + for i in trange(len(iter.srcs)): + im = next(iter) + if isinstance(im, str): + image_path = im + im = cv2.imread(im) + res = predictor(im) + if inference_logger: + inference_logger.log_inference(image_path, res) + + res = vis_res_fast(res, im, class_names, colors, conf_thresh) + # cv2.imshow('frame', res) + if args.output: + if pathlib.Path(args.output).is_dir(): + out_path = pathlib.Path(args.output) / pathlib.Path(image_path).name + else: + out_path = args.output + else: + out_path = "frame" + cv2.imshow(out_path, res) + + if iter.video_mode: + cv2.waitKey(1) else: - img = cv2.imread(args.input) - res = predictor(img) - res = vis_res_fast(res, img, metadata, colors) - # cv2.imshow('frame', res) - cv2.imshow('frame', res) - cv2.waitKey(0) - elif args.webcam: - print('Not supported.') - elif args.video_input: - video = cv2.VideoCapture(args.video_input) - width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) - height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) - frames_per_second = video.get(cv2.CAP_PROP_FPS) - num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) - basename = os.path.basename(args.video_input) - - while(video.isOpened()): - ret, frame = video.read() - # frame = cv2.resize(frame, (640, 640)) - res = predictor(frame) - res = vis_res_fast(res, frame, metadata, colors) - # cv2.imshow('frame', res) - cv2.imshow('frame', res) - if cv2.waitKey(1) & 0xFF == ord('q'): - break + if cv2.waitKey(0) & 0xFF == ord("q"): + continue + if inference_logger: + inference_logger.finish_run() diff --git a/demo_lazyconfig.py b/demo_lazyconfig.py old mode 100755 new mode 100644 diff --git a/deploy/configs/r18.yaml b/deploy/configs/r18.yaml new file mode 100644 index 
0000000..4a42480 --- /dev/null +++ b/deploy/configs/r18.yaml @@ -0,0 +1,42 @@ +extra_prepare_dict: + extra_qconfig_dict: + w_observer: MinMaxObserver + a_observer: EMAMinMaxObserver + w_fakequantize: FixedFakeQuantize + a_fakequantize: FixedFakeQuantize + w_qscheme: + bit: 8 + # symmetry: False + symmetry: true + per_channel: True + pot_scale: False + a_qscheme: + bit: 8 + # symmetry: False + symmetry: true + per_channel: False + pot_scale: False +quantize: + quantize_type: naive_ptq # support naive_ptq or advanced_ptq + cali_batchsize: 16 + backend: 'Tensorrt' + # backend: 'ONNX_QNN' + # backend: 'PPLW8A16' + deploy: + model_name: 'r18.onnx' + output_path: './' + deploy_to_qlinear: true +model: # architecture details + type: resnet18 # model name + kwargs: + num_classes: 1000 + path: /path-of-pretrained +data: + path: /path-of-imagenet + batch_size: 64 + num_workers: 4 + pin_memory: True + input_size: 224 + test_resize: 256 +process: + seed: 1005 \ No newline at end of file diff --git a/deploy/demo_quantized_int8.py b/deploy/demo_quantized_int8.py new file mode 100644 index 0000000..2535031 --- /dev/null +++ b/deploy/demo_quantized_int8.py @@ -0,0 +1,161 @@ +from torch import Tensor +from wanwu.core.backends.trt import TensorRTInferencer +import os +import cv2 +import argparse +import numpy as np +import onnxruntime +from alfred.vis.image.det import visualize_det_cv2_part +from alfred.vis.image.mask import vis_bitmasks_with_classes +from alfred.utils.file_io import ImageSourceIter + + +def vis_res_fast(img, boxes, masks, scores, labels): + if masks is not None: + # masks shape, might not same as img, resize contours if so + img = vis_bitmasks_with_classes( + img, + labels, + masks, + force_colors=None, + draw_contours=True, + mask_border_color=[255, 255, 255], + ) + thickness = 1 if masks is None else 2 + font_scale = 0.3 if masks is None else 0.4 + if boxes: + img = visualize_det_cv2_part( + img, + scores, + labels, + boxes, + line_thickness=thickness, + font_scale=font_scale, + ) + return img + + +def load_test_image(f, h, w): + a = cv2.imread(f) + a = cv2.resize(a, (w, h)) + a_t = np.expand_dims(np.array(a).astype(np.float32), axis=0) + return a_t, a + + +def preprocess_image(img, h, w): + a = cv2.resize(img, (w, h)) + a_t = np.expand_dims(np.array(a).astype(np.float32), axis=0) + return a_t, img + + +def make_parser(): + parser = argparse.ArgumentParser("onnxruntime inference sample") + parser.add_argument( + "-m", + "--model", + type=str, + default="yolox.onnx", + help="Input your onnx model.", + ) + parser.add_argument( + "-i", + "--image_path", + type=str, + default="test_image.png", + help="Path to your input image.", + ) + parser.add_argument( + "-o", + "--output_dir", + type=str, + default="demo_output", + help="Path to your output directory.", + ) + parser.add_argument( + "-s", + "--score_thr", + type=float, + default=0.3, + help="Score threshould to filter the result.", + ) + parser.add_argument( + "-t", + "--type", + default='sparseinst', + help="model type.", + ) + return parser + + +if __name__ == "__main__": + args = make_parser().parse_args() + + engine_f = args.model + trt_model = TensorRTInferencer(engine_f) + input_shape = trt_model.ori_input_shape + print('input shape: ', input_shape) + + iter = ImageSourceIter(args.image_path) + while True: + im = next(iter) + if isinstance(im, str): + im = cv2.imread(im) + + inp, ori_img = preprocess_image(im, h=input_shape[0], w=input_shape[1]) + output = trt_model.infer(inp) + + print(output) + + if "sparse" in args.type: + masks, 
scores, labels = None, None, None + for o in output: + if o.dtype == np.float32: + scores = o + if o.dtype == np.int32 or o.dtype == np.int64: + labels = o + if o.dtype == bool: + masks = o + masks = masks[0] + print(masks.shape) + if len(masks.shape) > 3: + masks = np.squeeze(masks, axis=1) + scores = scores[0] + labels = labels[0] + # keep = scores > 0.15 + keep = scores > 0.06 + scores = scores[keep] + labels = labels[keep] + masks = masks[keep] + print(scores) + print(labels) + print(masks.shape) + img = vis_res_fast(im, None, masks, scores, labels) + else: + predictions = demo_postprocess(output[0], input_shape, p6=args.with_p6)[0] + boxes = predictions[:, :4] + scores = predictions[:, 4:5] * predictions[:, 5:] + + boxes_xyxy = np.ones_like(boxes) + boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0 + boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0 + boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0 + boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0 + # boxes_xyxy /= ratio + dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.65, score_thr=0.1) + final_boxes, final_scores, final_cls_inds = ( + dets[:, :4], + dets[:, 4], + dets[:, 5], + ) + img = visualize_det_cv2_part( + ori_img, final_scores, final_cls_inds, final_boxes + ) + cv2.imshow("aa", img) + cv2.waitKey(0) + + cv2.imshow("YOLOv7 SparseInst CPU int8", img) + if iter.video_mode: + if cv2.waitKey(1) & 0xFF == ord("q"): + break + else: + cv2.waitKey(0) diff --git a/deploy/onnxrt_infer.py b/deploy/ort_infer.py old mode 100755 new mode 100644 similarity index 53% rename from deploy/onnxrt_infer.py rename to deploy/ort_infer.py index 60f46fe..b4f1fc4 --- a/deploy/onnxrt_infer.py +++ b/deploy/ort_infer.py @@ -1,12 +1,12 @@ - - import argparse +from cProfile import label import os import cv2 import numpy as np import onnxruntime from alfred.vis.image.det import visualize_det_cv2_part - +from alfred.vis.image.mask import vis_bitmasks_with_classes +from alfred.utils.file_io import ImageSourceIter """ @@ -62,7 +62,8 @@ def multiclass_nms(boxes, scores, nms_thr, score_thr): if len(keep) > 0: cls_inds = np.ones((len(keep), 1)) * cls_ind dets = np.concatenate( - [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1) + [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1 + ) final_dets.append(dets) return np.concatenate(final_dets, 0) @@ -76,8 +77,8 @@ def demo_postprocess(outputs, img_size, p6=False): else: strides = [8, 16, 32, 64] - hsizes = [img_size[0]//stride for stride in strides] - wsizes = [img_size[1]//stride for stride in strides] + hsizes = [img_size[0] // stride for stride in strides] + wsizes = [img_size[1] // stride for stride in strides] for hsize, wsize, stride in zip(hsizes, wsizes, strides): # xv, yv = np.meshgrid(np.arange(hsize), np.arange(wsize)) @@ -103,12 +104,14 @@ def preproc(image, input_size, mean, std, swap=(2, 0, 1)): img = np.array(image) r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) resized_img = cv2.resize( - img, (int(img.shape[1] * r), int(img.shape[0] * r)), interpolation=cv2.INTER_LINEAR + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, ).astype(np.float32) padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img image = padded_img - cv2.imshow('aad', image.astype(np.uint8)) + cv2.imshow("aad", image.astype(np.uint8)) # cv2.waitKey() image = image.astype(np.float32) @@ -136,14 +139,14 @@ def make_parser(): "-i", "--image_path", type=str, - default='test_image.png', + default="test_image.png", help="Path to 
your input image.", ) parser.add_argument( "-o", "--output_dir", type=str, - default='demo_output', + default="demo_output", help="Path to your output directory.", ) parser.add_argument( @@ -164,48 +167,119 @@ def make_parser(): action="store_true", help="Whether your model uses p6 in FPN/PAN.", ) + parser.add_argument( + "-int8", + '--int8', + action="store_true", + help="Whether your model uses int8.", + ) return parser +def vis_res_fast(img, boxes, masks, scores, labels): + if masks is not None: + # masks shape, might not same as img, resize contours if so + img = vis_bitmasks_with_classes( + img, + labels, + masks, + force_colors=None, + draw_contours=True, + mask_border_color=[255, 255, 255], + ) + thickness = 1 if masks is None else 2 + font_scale = 0.3 if masks is None else 0.4 + if boxes: + img = visualize_det_cv2_part( + img, + scores, + labels, + boxes, + line_thickness=thickness, + font_scale=font_scale, + ) + return img + + def load_test_image(f, h, w): a = cv2.imread(f) a = cv2.resize(a, (w, h)) - a_t = [a] + a_t = np.expand_dims(np.array(a).astype(np.float32), axis=0) return a_t, a -if __name__ == '__main__': - args = make_parser().parse_args() +def preprocess_image(img, h, w): + a = cv2.resize(img, (w, h)) + a_t = np.expand_dims(np.array(a).astype(np.float32), axis=0) + return a_t, img - input_shape = tuple(map(int, args.input_shape.split(','))) - # origin_img = cv2.imread(args.image_path) - # img, ratio = preproc(origin_img, input_shape) - inp, ori_img = load_test_image(args.image_path, h=input_shape[0], w=input_shape[1]) +if __name__ == "__main__": + args = make_parser().parse_args() + input_shape = tuple(map(int, args.input_shape.split(","))) session = onnxruntime.InferenceSession(args.model) - ort_inputs = {session.get_inputs()[0].name: inp} - output = session.run(None, ort_inputs) - print(output[0].shape) - - predictions = demo_postprocess(output[0], input_shape, p6=args.with_p6)[0] - - boxes = predictions[:, :4] - scores = predictions[:, 4:5] * predictions[:, 5:] - - boxes_xyxy = np.ones_like(boxes) - boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2. - boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2. - boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2. - boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2. 
- # boxes_xyxy /= ratio - dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.65, score_thr=0.1) - - final_boxes, final_scores, final_cls_inds = dets[:, - :4], dets[:, 4], dets[:, 5] - - img = visualize_det_cv2_part( - ori_img, final_scores, final_cls_inds, final_boxes) - - cv2.imshow('aa', img) - cv2.waitKey(0) + iter = ImageSourceIter(args.image_path) + while True: + im = next(iter) + if isinstance(im, str): + im = cv2.imread(im) + + inp, ori_img = preprocess_image(im, h=input_shape[0], w=input_shape[1]) + + ort_inputs = {session.get_inputs()[0].name: inp} + output = session.run(None, ort_inputs) + + if "sparse" in args.model: + masks, scores, labels = None, None, None + for o in output: + if o.dtype == np.float32: + scores = o + if o.dtype == np.int32 or o.dtype == np.int64: + labels = o + if o.dtype == bool: + masks = o + masks = masks[0] + print(masks.shape) + if len(masks.shape) > 3: + masks = np.squeeze(masks, axis=1) + scores = scores[0] + labels = labels[0] + # keep = scores > 0.15 + keep = scores > (0.13 if args.int8 else 0.32) + scores = scores[keep] + labels = labels[keep] + masks = masks[keep] + print(scores) + print(labels) + print(masks.shape) + img = vis_res_fast(im, None, masks, scores, labels) + else: + predictions = demo_postprocess(output[0], input_shape, p6=args.with_p6)[0] + boxes = predictions[:, :4] + scores = predictions[:, 4:5] * predictions[:, 5:] + + boxes_xyxy = np.ones_like(boxes) + boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2.0 + boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2.0 + boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2.0 + boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2.0 + # boxes_xyxy /= ratio + dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.65, score_thr=0.1) + final_boxes, final_scores, final_cls_inds = ( + dets[:, :4], + dets[:, 4], + dets[:, 5], + ) + img = visualize_det_cv2_part( + ori_img, final_scores, final_cls_inds, final_boxes + ) + cv2.imshow("aa", img) + cv2.waitKey(0) + + cv2.imshow("YOLOv7 SparseInst CPU int8", img) + if iter.video_mode: + if cv2.waitKey(1) & 0xFF == ord("q"): + break + else: + cv2.waitKey(0) diff --git a/deploy/prune_nni.py b/deploy/prune_nni.py old mode 100755 new mode 100644 diff --git a/deploy/quant_atom/.gitignore b/deploy/quant_atom/.gitignore new file mode 100644 index 0000000..2fe317a --- /dev/null +++ b/deploy/quant_atom/.gitignore @@ -0,0 +1 @@ +Output/ diff --git a/deploy/quant_atom/qt_ppq_sinst.py b/deploy/quant_atom/qt_ppq_sinst.py new file mode 100644 index 0000000..74fcb72 --- /dev/null +++ b/deploy/quant_atom/qt_ppq_sinst.py @@ -0,0 +1,153 @@ +""" + +Examples on how to quantize with PPQ + +I dont suggest you using PPQ, it has a lot of bugs. 
+ +""" +from typing import Iterable + +from loguru import logger +import torch +from torch.utils.data import DataLoader +from ppq import BaseGraph, QuantizationSettingFactory, TargetPlatform +from ppq import graphwise_error_analyse, layerwise_error_analyse +from ppq.api import ( + export_ppq_graph, + quantize_onnx_model +) +import sys +from torchvision import transforms +import torchvision +import torch +from atomquant.onnx.dataloader import get_calib_dataloader_coco +import os +import cv2 +import numpy as np +import onnxruntime as ort +from torchvision.datasets.coco import CocoDetection +from alfred.dl.torch.common import device + + +def preprocess_func(img, target): + w = 640 + h = 640 + a = cv2.resize(img, (w, h)) + a_t = np.array(a).astype(np.float32) + boxes = [] + for t in target: + boxes.append(t["bbox"]) + target = np.array(boxes) + a_t = torch.as_tensor(a_t) + target = torch.as_tensor(target) + return a_t, target + + +def collate_fn(batch): + images, targets = zip(*batch) + if isinstance(images[0], torch.Tensor): + images = torch.stack(images) + targets = torch.stack(targets) + else: + images = np.array(images) + return images + + +if __name__ == "__main__": + ONNX_PATH = sys.argv[1] + + coco_root = os.path.expanduser("~/data/coco/images/val2017") + anno_f = os.path.expanduser( + "~/data/coco/annotations/instances_val2017_val_val_train.json" + ) + + # coco_ds = CocoDetection(coco_root, anno_f, ) + + session = ort.InferenceSession(ONNX_PATH) + input_name = session.get_inputs()[0].name + + calib_dataloader = get_calib_dataloader_coco( + coco_root, + anno_f, + preprocess_func=preprocess_func, + input_names=input_name, + bs=1, + max_step=50, + collate_fn=collate_fn + ) + + REQUIRE_ANALYSE = False + BATCHSIZE = 1 + # INPUT_SHAPE = [3, 224, 224] + INPUT_SHAPE = [640, 640, 3] + DEVICE = "cuda" + PLATFORM = ( + # TargetPlatform.ORT_OOS_INT8 + TargetPlatform.TRT_INT8 + ) + EXECUTING_DEVICE = "cpu" # 'cuda' or 'cpu'. + + # create a setting for quantizing your network with PPL CUDA. + # quant_setting = QuantizationSettingFactory.pplcuda_setting() + quant_setting = QuantizationSettingFactory.default_setting() + # quant_setting.equalization = True # use layerwise equalization algorithm. + quant_setting.equalization = False # tensorrt false + quant_setting.dispatcher = ( + "conservative" # dispatch this network in conservertive way. + ) + + + # quantize your model. + quantized = quantize_onnx_model( + onnx_import_file=ONNX_PATH, + calib_dataloader=calib_dataloader.dataloader_holder, + calib_steps=120, + input_shape=[BATCHSIZE] + INPUT_SHAPE, + setting=quant_setting, + # collate_fn=collate_fn, + platform=PLATFORM, + device=DEVICE, + verbose=0, + ) + + # Quantization Result is a PPQ BaseGraph instance. 
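+ # The try/except below is PPQ's optional accuracy check: graphwise_error_analyse reports the accumulated quantization error (SNR) per layer and layerwise_error_analyse measures each layer in isolation, so layers whose error exceeds roughly 0.1 can be singled out for further tuning.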
+ assert isinstance(quantized, BaseGraph) + + try: + if REQUIRE_ANALYSE: + print("computing graph-wise quantization error (SNR); the last layer's error should stay below 0.1 to preserve quantization accuracy:") + reports = graphwise_error_analyse( + graph=quantized, + running_device=EXECUTING_DEVICE, + steps=32, + dataloader=calib_dataloader.dataloader_holder, + collate_fn=lambda x: x.to(EXECUTING_DEVICE), + ) + for op, snr in reports.items(): + if snr > 0.1: + logger.warning(f"layer {op} has a large accumulated quantization error, consider optimizing it") + print("computing layer-wise quantization error (SNR); each layer's own error should stay below 0.1 to preserve quantization accuracy:") + layerwise_error_analyse( + graph=quantized, + running_device=EXECUTING_DEVICE, + interested_outputs=None, + dataloader=calib_dataloader.dataloader_holder, + collate_fn=lambda x: x.to(EXECUTING_DEVICE), + ) + except Exception as e: + logger.warning('error analysis failed, but that is OK, skipping it.') + + + # EXPORT_TARGET = TargetPlatform.ORT_OOS_INT8 + EXPORT_TARGET = TargetPlatform.TRT_INT8 + # EXPORT_TARGET = TargetPlatform.TRT_INT8 + os.makedirs('Output/', exist_ok=True) + # export quantized graph. + export_ppq_graph( + graph=quantized, + platform=EXPORT_TARGET, + graph_save_to=f"Output/quantized_{EXPORT_TARGET}.onnx", + config_save_to=f"Output/quantized_{EXPORT_TARGET}.json", + ) + + \ No newline at end of file diff --git a/deploy/quant_fx/.gitignore b/deploy/quant_fx/.gitignore new file mode 100644 index 0000000..fc7f859 --- /dev/null +++ b/deploy/quant_fx/.gitignore @@ -0,0 +1,4 @@ +*.pt +*.torchscript +data/ +vendor/ diff --git a/deploy/quant_fx/calib.py b/deploy/quant_fx/calib.py new file mode 100644 index 0000000..7f7a0e9 --- /dev/null +++ b/deploy/quant_fx/calib.py @@ -0,0 +1,85 @@ +import torch +import os +from pytorch_quantization import nn as quant_nn +import tqdm +from pytorch_quantization import calib + + +def compute_amax(model, **kwargs): + # Load calib result + for name, module in model.named_modules(): + if isinstance(module, quant_nn.TensorQuantizer): + if module._calibrator is not None: + if isinstance(module._calibrator, calib.MaxCalibrator): + module.load_calib_amax() + else: + module.load_calib_amax(**kwargs) + print(F"{name:40}: {module}") + model.cuda() + +def collect_stats(model, data_loader, num_batches): + """Feed data to the network and collect statistics""" + # Enable calibrators + for name, module in model.named_modules(): + if isinstance(module, quant_nn.TensorQuantizer): + if module._calibrator is not None: + module.disable_quant() + module.enable_calib() + else: + module.disable() + + # Feed data to the network for collecting stats + for i, (image, _) in tqdm.tqdm(enumerate(data_loader), total=num_batches): + model(image.cuda()) + if i >= num_batches: + break + + # Disable calibrators + for name, module in model.named_modules(): + if isinstance(module, quant_nn.TensorQuantizer): + if module._calibrator is not None: + module.enable_quant() + module.disable_calib() + else: + module.enable() + +def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator, hist_percentile, out_dir): + """ + Feed data to the network and calibrate.
+ Arguments: + model: classification model + model_name: name to use when creating state files + data_loader: calibration data set + num_calib_batch: amount of calibration passes to perform + calibrator: type of calibration to use (max/histogram) + hist_percentile: percentiles to be used for historgram calibration + out_dir: dir to save state files in + """ + + if num_calib_batch > 0: + print("Calibrating model") + with torch.no_grad(): + collect_stats(model, data_loader, num_calib_batch) + + if not calibrator == "histogram": + compute_amax(model, method="max") + calib_output = os.path.join( + out_dir, + F"{model_name}-max-{num_calib_batch*data_loader.batch_size}.pth") + torch.save(model.state_dict(), calib_output) + else: + for percentile in hist_percentile: + print(F"{percentile} percentile calibration") + compute_amax(model, method="percentile") + calib_output = os.path.join( + out_dir, + F"{model_name}-percentile-{percentile}-{num_calib_batch*data_loader.batch_size}.pth") + torch.save(model.state_dict(), calib_output) + + for method in ["mse", "entropy"]: + print(F"{method} calibration") + compute_amax(model, method=method) + calib_output = os.path.join( + out_dir, + F"{model_name}-{method}-{num_calib_batch*data_loader.batch_size}.pth") + torch.save(model.state_dict(), calib_output) \ No newline at end of file diff --git a/deploy/quant_fx/configs/r18.yaml b/deploy/quant_fx/configs/r18.yaml new file mode 100644 index 0000000..59010b5 --- /dev/null +++ b/deploy/quant_fx/configs/r18.yaml @@ -0,0 +1,42 @@ +extra_prepare_dict: + extra_qconfig_dict: + w_observer: MinMaxObserver + a_observer: EMAMinMaxObserver + w_fakequantize: LearnableFakeQuantize + a_fakequantize: LearnableFakeQuantize + w_qscheme: + bit: 8 + # symmetry: False + symmetry: true + per_channel: True + pot_scale: False + a_qscheme: + bit: 8 + # symmetry: False + symmetry: true + per_channel: False + pot_scale: False +quantize: + quantize_type: naive_ptq # support naive_ptq or advanced_ptq + cali_batchsize: 16 + # backend: 'Tensorrt' + backend: 'ONNX_QNN' + # backend: 'PPLW8A16' + deploy: + model_name: 'r18.onnx' + output_path: './' + deploy_to_qlinear: true +model: # architecture details + type: resnet18 # model name + kwargs: + num_classes: 1000 + path: /path-of-pretrained +data: + path: /path-of-imagenet + batch_size: 64 + num_workers: 4 + pin_memory: True + input_size: 224 + test_resize: 256 +process: + seed: 1005 \ No newline at end of file diff --git a/deploy/quant_fx/fx_ptq_test.py b/deploy/quant_fx/fx_ptq_test.py new file mode 100644 index 0000000..9b52b59 --- /dev/null +++ b/deploy/quant_fx/fx_ptq_test.py @@ -0,0 +1,296 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import copy +import torchvision +from torchvision import transforms +from torchvision.models.resnet import resnet50, resnet18 +from torch.quantization.quantize_fx import prepare_fx, convert_fx +from torch.ao.quantization.fx.graph_module import ObservedGraphModule +from torch.quantization import ( + get_default_qconfig, +) +from torch import optim +from torch.onnx import OperatorExportTypes +import os +import time + + +def train_model(model, train_loader, test_loader, device): + # The training configurations were not carefully selected. + learning_rate = 1e-2 + num_epochs = 20 + criterion = nn.CrossEntropyLoss() + model.to(device) + # It seems that SGD optimizer is better than Adam optimizer for ResNet18 training on CIFAR10. 
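+ # plain SGD with momentum 0.9 and weight decay 1e-5 on the hard-coded lr/epoch settings above; the commented-out Adam line below is kept only for comparison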
+ optimizer = optim.SGD( + model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5 + ) + # optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False) + for epoch in range(num_epochs): + # Training + model.train() + + running_loss = 0 + running_corrects = 0 + + for inputs, labels in train_loader: + inputs = inputs.to(device) + labels = labels.to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # statistics + running_loss += loss.item() * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + train_loss = running_loss / len(train_loader.dataset) + train_accuracy = running_corrects / len(train_loader.dataset) + + # Evaluation + model.eval() + eval_loss, eval_accuracy = evaluate_model( + model=model, test_loader=test_loader, device=device, criterion=criterion + ) + print( + "Epoch: {:02d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}".format( + epoch, train_loss, train_accuracy, eval_loss, eval_accuracy + ) + ) + return model + + +def prepare_dataloader(num_workers=8, train_batch_size=128, eval_batch_size=256): + train_transform = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + test_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + train_set = torchvision.datasets.CIFAR10( + root="data", train=True, download=True, transform=train_transform + ) + # We will use test set for validation and test in this project. + # Do not use test set for validation in practice! 
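+    # In these demos the test split is also reused later as calibration data for post-training quantization.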
+ test_set = torchvision.datasets.CIFAR10( + root="data", train=False, download=True, transform=test_transform + ) + train_sampler = torch.utils.data.RandomSampler(train_set) + test_sampler = torch.utils.data.SequentialSampler(test_set) + + train_loader = torch.utils.data.DataLoader( + dataset=train_set, + batch_size=train_batch_size, + sampler=train_sampler, + num_workers=num_workers, + ) + test_loader = torch.utils.data.DataLoader( + dataset=test_set, + batch_size=eval_batch_size, + sampler=test_sampler, + num_workers=num_workers, + ) + return train_loader, test_loader + + +def evaluate_model(model, test_loader, device=torch.device("cpu"), criterion=None): + t0 = time.time() + model.eval() + model.to(device) + running_loss = 0 + running_corrects = 0 + for inputs, labels in test_loader: + + inputs = inputs.to(device) + labels = labels.to(device) + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + + if criterion is not None: + loss = criterion(outputs, labels).item() + else: + loss = 0 + + # statistics + running_loss += loss * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + eval_loss = running_loss / len(test_loader.dataset) + eval_accuracy = running_corrects / len(test_loader.dataset) + t1 = time.time() + print(f"eval loss: {eval_loss}, eval acc: {eval_accuracy}, cost: {t1 - t0}") + return eval_loss, eval_accuracy + + +def get_output_from_logits(logits): + probs = F.softmax(logits) + label, prob = torch.max(probs, dim=-1) + print(label, prob) + return + + +def calib_quant_model(model, calib_dataloader): + assert isinstance( + model, ObservedGraphModule + ), "model must be a perpared fx ObservedGraphModule." + model.eval() + with torch.inference_mode(): + for inputs, labels in calib_dataloader: + model(inputs) + print("calib done.") + + +def quant_fx(model): + model.eval() + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = { + "": qconfig, + # 'object_type': [] + } + model_to_quantize = copy.deepcopy(model) + prepared_model = prepare_fx(model_to_quantize, qconfig_dict) + print("prepared model: ", prepared_model) + + quantized_model = convert_fx(prepared_model) + print("quantized model: ", quantized_model) + torch.save(model.state_dict(), "r18.pth") + torch.save(quantized_model.state_dict(), "r18_quant.pth") + + +def quant_calib_and_eval(model): + # test only on CPU + model.to(torch.device("cpu")) + model.eval() + + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = { + "": qconfig, + # 'object_type': [] + } + + model2 = copy.deepcopy(model) + model_prepared = prepare_fx(model2, qconfig_dict) + model_int8 = convert_fx(model_prepared) + model_int8.load_state_dict(torch.load("r18_quant.pth")) + model_int8.eval() + + a = torch.randn([1, 3, 224, 224]) + o1 = model(a) + o2 = model_int8(a) + + diff = torch.allclose(o1, o2, 1e-4) + print(diff) + print(o1.shape, o2.shape) + print(o1, o2) + get_output_from_logits(o1) + get_output_from_logits(o2) + + train_loader, test_loader = prepare_dataloader() + evaluate_model(model, test_loader) + evaluate_model(model_int8, test_loader) + + # calib quant model + model2 = copy.deepcopy(model) + model_prepared = prepare_fx(model2, qconfig_dict) + model_int8 = convert_fx(model_prepared) + torch.save(model_int8.state_dict(), "r18.pth") + model_int8.eval() + + model_prepared = prepare_fx(model2, qconfig_dict) + calib_quant_model(model_prepared, test_loader) + model_int8 = convert_fx(model_prepared) + torch.save(model_int8.state_dict(), "r18_quant_calib.pth") + evaluate_model(model_int8, test_loader) + + +def 
export_quant_onnx(model): + model.to(torch.device("cpu")) + model.eval() + + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = { + "": qconfig, + # 'object_type': [] + } + + model2 = copy.deepcopy(model) + model_prepared = prepare_fx(model2, qconfig_dict) + model_int8 = convert_fx(model_prepared) + model_int8.load_state_dict(torch.load("r18_quant_calib.pth")) + model_int8.eval() + + a = torch.randn([1, 3, 224, 224]) + torch.onnx.export(model_int8, a, "r18_int8.onnx") + print("int8 onnx saved.") + + +def export_quant_torchscript(model): + model.to(torch.device("cpu")) + model.eval() + + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = { + "": qconfig, + # 'object_type': [] + } + print(qconfig_dict) + with torch.no_grad(): + model2 = copy.deepcopy(model) + model_prepared = prepare_fx(model2, qconfig_dict) + model_int8 = convert_fx(model_prepared) + model_int8.load_state_dict(torch.load("r18_quant_calib.pth")) + model_int8.eval() + + a = torch.randn([1, 3, 224, 224]) + # torch.jit.save.export(model_int8, a, "r18_int8.onnx") + sm = torch.jit.trace(model_int8, a) + sm.save("r18_int8.torchscript") + + dm = torch.jit.load("r18_int8.torchscript") + + print(model_int8) + torch.onnx.export( + model_int8, + a, + "r18_int8.onnx", + opset_version=13, + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, + ) + print("int8 onnx saved.") + evaluate_model(dm, test_loader) + + +if __name__ == "__main__": + train_loader, test_loader = prepare_dataloader() + + # first finetune model on cifar, we don't have imagnet so using cifar as test + model = resnet18(pretrained=True) + model.fc = nn.Linear(512, 10) + if os.path.exists("r18_raw.pth"): + model.load_state_dict(torch.load("r18_raw.pth", map_location="cpu")) + else: + from alfred.dl.torch.common import device + train_model(model, train_loader, test_loader, device) + print("train finished.") + torch.save(model.state_dict(), "r18_raw.pth") + + with torch.no_grad(): + quant_fx(model) + quant_calib_and_eval(model) + export_quant_torchscript(model) diff --git a/deploy/quant_fx/qt_mq_test.py b/deploy/quant_fx/qt_mq_test.py new file mode 100644 index 0000000..55187e8 --- /dev/null +++ b/deploy/quant_fx/qt_mq_test.py @@ -0,0 +1,218 @@ + +from statistics import mode +import numpy as np +import argparse +from torchvision.models.resnet import resnet50, resnet18 +import torch.nn as nn +import os +import time +from easydict import EasyDict +import yaml +import sys +from alfred.dl.torch.common import device +from alfred.utils.log import logger +from atomquant.atom.prepare_by_platform import prepare_by_platform, BackendType +from atomquant.atom.convert_deploy import convert_deploy +from torchvision import transforms +import torchvision +import torch + +backend_dict = { + 'Academic': BackendType.Academic, + 'Tensorrt': BackendType.Tensorrt, + 'SNPE': BackendType.SNPE, + 'PPLW8A16': BackendType.PPLW8A16, + 'NNIE': BackendType.NNIE, + 'Vitis': BackendType.Vitis, + 'ONNX_QNN': BackendType.ONNX_QNN, + 'PPLCUDA': BackendType.PPLCUDA, +} + + +def parse_config(config_file): + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + cur_config = config + cur_path = config_file + while 'root' in cur_config: + root_path = os.path.dirname(cur_path) + cur_path = os.path.join(root_path, cur_config['root']) + with open(cur_path) as r: + root_config = yaml.load(r, Loader=yaml.FullLoader) + for k, v in root_config.items(): + if k not in config: + config[k] = v + cur_config = root_config + # config = yaml.safe_load(f) + config = EasyDict(config) 
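+    # EasyDict gives attribute-style access, e.g. config.quantize.backend instead of config['quantize']['backend'].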
+ return config + + +def load_calibrate_data(train_loader, cali_batchsize): + cali_data = [] + for i, batch in enumerate(train_loader): + cali_data.append(batch[0]) + if i + 1 == cali_batchsize: + break + return cali_data + + +def get_quantize_model(model, config): + backend_type = BackendType.Academic if not hasattr( + config.quantize, 'backend') else backend_dict[config.quantize.backend] + extra_prepare_dict = {} if not hasattr( + config, 'extra_prepare_dict') else config.extra_prepare_dict + return prepare_by_platform( + model, backend_type, extra_prepare_dict) + + +def deploy(model, config): + backend_type = BackendType.Academic if not hasattr( + config.quantize, 'backend') else backend_dict[config.quantize.backend] + output_path = './' if not hasattr( + config.quantize, 'deploy') else config.quantize.deploy.output_path + model_name = config.quantize.deploy.model_name + deploy_to_qlinear = False if not hasattr( + config.quantize.deploy, 'deploy_to_qlinear') else config.quantize.deploy.deploy_to_qlinear + + convert_deploy(model, backend_type, { + 'input': [1, 3, 224, 224]}, output_path=output_path, model_name=model_name, deploy_to_qlinear=deploy_to_qlinear) + + +def prepare_dataloader(num_workers=8, train_batch_size=128, eval_batch_size=256): + train_transform = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + test_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + train_set = torchvision.datasets.CIFAR10( + root="data", train=True, download=True, transform=train_transform + ) + # We will use test set for validation and test in this project. + # Do not use test set for validation in practice! 
+ test_set = torchvision.datasets.CIFAR10( + root="data", train=False, download=True, transform=test_transform + ) + train_sampler = torch.utils.data.RandomSampler(train_set) + test_sampler = torch.utils.data.SequentialSampler(test_set) + + train_loader = torch.utils.data.DataLoader( + dataset=train_set, + batch_size=train_batch_size, + sampler=train_sampler, + num_workers=num_workers, + ) + test_loader = torch.utils.data.DataLoader( + dataset=test_set, + batch_size=eval_batch_size, + sampler=test_sampler, + num_workers=num_workers, + ) + return train_loader, test_loader + + +def evaluate_model(model, test_loader, criterion=None): + t0 = time.time() + model.eval() + model.to(device) + running_loss = 0 + running_corrects = 0 + for inputs, labels in test_loader: + + inputs = inputs.to(device) + labels = labels.to(device) + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + + if criterion is not None: + loss = criterion(outputs, labels).item() + else: + loss = 0 + + # statistics + running_loss += loss * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + eval_loss = running_loss / len(test_loader.dataset) + eval_accuracy = running_corrects / len(test_loader.dataset) + t1 = time.time() + print(f"eval loss: {eval_loss}, eval acc: {eval_accuracy}, cost: {t1 - t0}") + return eval_loss, eval_accuracy + + +if __name__ == '__main__': + train_loader, test_loader = prepare_dataloader() + + config_f = sys.argv[1] + config = parse_config(config_f) + print(config) + # first finetune model on cifar, we don't have imagnet so using cifar as test + model = resnet18(pretrained=True) + model.fc = nn.Linear(512, 10) + if os.path.exists("r18_raw.pth"): + model.load_state_dict(torch.load("r18_raw.pth", map_location="cpu")) + else: + # train_model(model, train_loader, test_loader, device) + print("train finished.") + # torch.save(model.state_dict(), "r18_raw.pth") + model.to(device) + model.eval() + + if hasattr(config, 'quantize'): + model = get_quantize_model(model, config) + print('now model in quantized mode.') + + model.to(device) + evaluate_model(model, test_loader) + + # evaluate + if not hasattr(config, 'quantize'): + evaluate_model(model, test_loader) + elif config.quantize.quantize_type == 'advanced_ptq': + print('begin calibration now!') + cali_data = load_calibrate_data(test_loader, cali_batchsize=config.quantize.cali_batchsize) + from mqbench.utils.state import enable_quantization, enable_calibration_woquantization + # do activation and weight calibration seperately for quick MSE per-channel for weight one + model.eval() + enable_calibration_woquantization(model, quantizer_type='act_fake_quant') + for batch in cali_data: + model(batch.cuda()) + enable_calibration_woquantization(model, quantizer_type='weight_fake_quant') + model(cali_data[0].cuda()) + print('begin advanced PTQ now!') + if hasattr(config.quantize, 'reconstruction'): + model = ptq_reconstruction( + model, cali_data, config.quantize.reconstruction) + enable_quantization(model) + evaluate_model(model, test_loader) + if hasattr(config.quantize, 'deploy'): + deploy(model, config) + elif config.quantize.quantize_type == 'naive_ptq': + print('begin calibration now!') + cali_data = load_calibrate_data(test_loader, cali_batchsize=config.quantize.cali_batchsize) + from atomquant.atom.utils.state import enable_quantization, enable_calibration_woquantization + # do activation and weight calibration seperately for quick MSE per-channel for weight one + model.eval() + enable_calibration_woquantization(model, 
quantizer_type='act_fake_quant') + for batch in cali_data: + model(batch.to(device)) + enable_calibration_woquantization(model, quantizer_type='weight_fake_quant') + model(cali_data[0].to(device)) + print('begin quantization now!') + enable_quantization(model) + # print(model) + evaluate_model(model, test_loader) + if hasattr(config.quantize, 'deploy'): + deploy(model, config) + else: + print("The quantize_type must in 'naive_ptq' or 'advanced_ptq',") + print("and 'advanced_ptq' need reconstruction configration.") \ No newline at end of file diff --git a/deploy/quant_fx/qt_q_test.py b/deploy/quant_fx/qt_q_test.py new file mode 100644 index 0000000..6212746 --- /dev/null +++ b/deploy/quant_fx/qt_q_test.py @@ -0,0 +1,30 @@ +import torch +from torch import quantization +from torchvision import models + +qat_resnet18 = models.resnet18(pretrained=True).cuda() + +qat_resnet18.qconfig = quantization.QConfig( + activation=quantization.default_fake_quant, + weight=quantization.default_per_channel_weight_fake_quant, +) +quantization.prepare_qat(qat_resnet18, inplace=True) +qat_resnet18.apply(quantization.enable_observer) +qat_resnet18.apply(quantization.enable_fake_quant) + +dummy_input = torch.randn(16, 3, 224, 224).cuda() +_ = qat_resnet18(dummy_input) +for module in qat_resnet18.modules(): + if isinstance(module, quantization.FakeQuantize): + module.calculate_qparams() +qat_resnet18.apply(quantization.disable_observer) + +qat_resnet18.cuda() + +input_names = ["actual_input_1"] +output_names = ["output1"] + + +torch.onnx.export( + qat_resnet18, dummy_input, "quant_model.onnx", verbose=True, opset_version=13 +) diff --git a/deploy/quant_fx/quant_ptq_test.py b/deploy/quant_fx/quant_ptq_test.py new file mode 100644 index 0000000..aa0bd0e --- /dev/null +++ b/deploy/quant_fx/quant_ptq_test.py @@ -0,0 +1,232 @@ +from bitarray import test +import torch +import torch.nn as nn +import torch.nn.functional as F +import copy +import torchvision +from torchvision import transforms +from torchvision.models.resnet import resnet50, resnet18 +from torch.quantization.quantize_fx import prepare_fx, convert_fx +from torch.ao.quantization.fx.graph_module import ObservedGraphModule +from torch.quantization import ( + default_dynamic_qconfig, + float_qparams_weight_only_qconfig, + get_default_qconfig, +) +from torch import optim +import os + + +def train_model(model, train_loader, test_loader, device): + # The training configurations were not carefully selected. + learning_rate = 1e-2 + num_epochs = 20 + criterion = nn.CrossEntropyLoss() + model.to(device) + # It seems that SGD optimizer is better than Adam optimizer for ResNet18 training on CIFAR10. 
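+    # Same fine-tuning loop as in fx_ptq_test.py, duplicated so this script can run standalone.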
+ optimizer = optim.SGD( + model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5 + ) + # optimizer = optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False) + for epoch in range(num_epochs): + # Training + model.train() + + running_loss = 0 + running_corrects = 0 + + for inputs, labels in train_loader: + inputs = inputs.to(device) + labels = labels.to(device) + + # zero the parameter gradients + optimizer.zero_grad() + + # forward + backward + optimize + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + loss = criterion(outputs, labels) + loss.backward() + optimizer.step() + + # statistics + running_loss += loss.item() * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + train_loss = running_loss / len(train_loader.dataset) + train_accuracy = running_corrects / len(train_loader.dataset) + + # Evaluation + model.eval() + eval_loss, eval_accuracy = evaluate_model( + model=model, test_loader=test_loader, device=device, criterion=criterion + ) + print( + "Epoch: {:02d} Train Loss: {:.3f} Train Acc: {:.3f} Eval Loss: {:.3f} Eval Acc: {:.3f}".format( + epoch, train_loss, train_accuracy, eval_loss, eval_accuracy + ) + ) + return model + + +def prepare_dataloader(num_workers=8, train_batch_size=128, eval_batch_size=256): + train_transform = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + test_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + train_set = torchvision.datasets.CIFAR10( + root="data", train=True, download=True, transform=train_transform + ) + # We will use test set for validation and test in this project. + # Do not use test set for validation in practice! 
+ test_set = torchvision.datasets.CIFAR10( + root="data", train=False, download=True, transform=test_transform + ) + train_sampler = torch.utils.data.RandomSampler(train_set) + test_sampler = torch.utils.data.SequentialSampler(test_set) + + train_loader = torch.utils.data.DataLoader( + dataset=train_set, + batch_size=train_batch_size, + sampler=train_sampler, + num_workers=num_workers, + ) + test_loader = torch.utils.data.DataLoader( + dataset=test_set, + batch_size=eval_batch_size, + sampler=test_sampler, + num_workers=num_workers, + ) + return train_loader, test_loader + + +def evaluate_model(model, test_loader, device=torch.device("cpu"), criterion=None): + model.eval() + model.to(device) + running_loss = 0 + running_corrects = 0 + for inputs, labels in test_loader: + + inputs = inputs.to(device) + labels = labels.to(device) + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + + if criterion is not None: + loss = criterion(outputs, labels).item() + else: + loss = 0 + + # statistics + running_loss += loss * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + eval_loss = running_loss / len(test_loader.dataset) + eval_accuracy = running_corrects / len(test_loader.dataset) + print(f"eval loss: {eval_loss}, eval acc: {eval_accuracy}") + return eval_loss, eval_accuracy + + +def get_output_from_logits(logits): + probs = F.softmax(logits) + label, prob = torch.max(probs, dim=-1) + print(label, prob) + return + + +def calib_quant_model(model, calib_dataloader): + # assert isinstance(model, ObservedGraphModule), 'model must be a perpared fx ObservedGraphModule.' + model.eval() + with torch.inference_mode(): + for inputs, labels in calib_dataloader: + model(inputs) + print("calib done.") + + +def quant_fx(model): + model.eval() + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = { + "": qconfig, + # 'object_type': [] + } + model_to_quantize = copy.deepcopy(model) + prepared_model = prepare_fx(model_to_quantize, qconfig_dict) + print("prepared model: ", prepared_model) + + quantized_model = convert_fx(prepared_model) + print("quantized model: ", quantized_model) + torch.save(model.state_dict(), "r18.pth") + torch.save(quantized_model.state_dict(), "r18_quant.pth") + + +def quant2(model): + # test only on CPU + model.to(torch.device("cpu")) + model.eval() + + qconfig = get_default_qconfig("fbgemm") + qconfig_dict = { + "": qconfig, + # 'object_type': [] + } + + model2 = copy.deepcopy(model) + model_prepared = torch.quantization.prepare(model2, qconfig_dict) + model_int8 = torch.quantization.convert(model_prepared, inplace=True) + model_int8.load_state_dict(torch.load("r18_quant.pth")) + model_int8.eval() + + a = torch.randn([1, 3, 224, 224]) + o1 = model(a) + o2 = model_int8(a) + + diff = torch.allclose(o1, o2, 1e-4) + print(diff) + print(o1.shape, o2.shape) + print(o1, o2) + get_output_from_logits(o1) + get_output_from_logits(o2) + + train_loader, test_loader = prepare_dataloader() + evaluate_model(model, test_loader) + evaluate_model(model_int8, test_loader) + + # calib quant model + model2 = copy.deepcopy(model) + model_prepared = torch.quantization.prepare(model2, qconfig_dict) + calib_quant_model(model_prepared, test_loader) + model_int8 = torch.quantization.convert(model_prepared, inplace=True) + torch.save(model_int8.state_dict(), "r18_quant_calib.pth") + model_int8.eval() + + calib_quant_model(model, test_loader) + evaluate_model(model_int8, test_loader) + + +if __name__ == "__main__": + train_loader, test_loader = prepare_dataloader() + + # first 
finetune model on cifar, we don't have imagnet so using cifar as test + model = resnet18(pretrained=True) + model.fc = nn.Linear(512, 10) + if os.path.exists("r18_row.pth"): + model.load_state_dict(torch.load("r18_row.pth", map_location="cpu")) + else: + train_model(model, train_loader, test_loader, torch.device("cuda")) + print("train finished.") + torch.save(model.state_dict(), "r18_row.pth") + + with torch.no_grad(): + quant_fx(model) + quant2(model) diff --git a/deploy/quant_fx/r18.onnx_clip_ranges.json b/deploy/quant_fx/r18.onnx_clip_ranges.json new file mode 100644 index 0000000..51c530d --- /dev/null +++ b/deploy/quant_fx/r18.onnx_clip_ranges.json @@ -0,0 +1,38 @@ +{ + "tensorrt": { + "blob_range": { + "input": 2.7645304203033447, + "::FixedPerTensorAffine_419": 5.25675630569458, + "::FixedPerTensorAffine_425": 5.23717737197876, + "::FixedPerTensorAffine_438": 2.820558786392212, + "::FixedPerTensorAffine_450": 3.860928535461426, + "::FixedPerTensorAffine_457": 5.737135887145996, + "::FixedPerTensorAffine_470": 2.6348819732666016, + "::FixedPerTensorAffine_482": 6.245926380157471, + "::FixedPerTensorAffine_489": 6.591638088226318, + "::FixedPerTensorAffine_502": 2.662112236022949, + "::FixedPerTensorAffine_514": 3.563220500946045, + "::FixedPerTensorAffine_526": 3.338449239730835, + "::FixedPerTensorAffine_533": 3.929392099380493, + "::FixedPerTensorAffine_546": 2.284010887145996, + "::FixedPerTensorAffine_558": 3.760089635848999, + "::FixedPerTensorAffine_565": 5.2349677085876465, + "::FixedPerTensorAffine_578": 2.779576539993286, + "::FixedPerTensorAffine_590": 3.2088496685028076, + "::FixedPerTensorAffine_602": 1.4625221490859985, + "::FixedPerTensorAffine_609": 3.332473039627075, + "::FixedPerTensorAffine_622": 2.045714855194092, + "::FixedPerTensorAffine_634": 4.317813396453857, + "::FixedPerTensorAffine_641": 5.273874759674072, + "::FixedPerTensorAffine_654": 3.230210542678833, + "::FixedPerTensorAffine_666": 13.654618263244629, + "::FixedPerTensorAffine_678": 2.2387030124664307, + "::FixedPerTensorAffine_685": 12.90011215209961, + "::FixedPerTensorAffine_698": 9.551894187927246, + "::FixedPerTensorAffine_710": 47.117374420166016, + "::FixedPerTensorAffine_717": 41.599098205566406, + "::FixedPerTensorAffine_724": 40.52422332763672, + "onnx::Flatten_723": 40.52422332763672 + } + } +} \ No newline at end of file diff --git a/deploy/quant_fx/test.py b/deploy/quant_fx/test.py new file mode 100644 index 0000000..488a15e --- /dev/null +++ b/deploy/quant_fx/test.py @@ -0,0 +1,5 @@ +from alfred.utils.log import logger + +logger.info('this ia info') +logger.warning('this ia info') +logger.error('this ia info') \ No newline at end of file diff --git a/deploy/quant_onnx/.gitignore b/deploy/quant_onnx/.gitignore new file mode 100644 index 0000000..462b358 --- /dev/null +++ b/deploy/quant_onnx/.gitignore @@ -0,0 +1,2 @@ +data/ +vendor/ diff --git a/deploy/quant_onnx/prepare_onnx.py b/deploy/quant_onnx/prepare_onnx.py new file mode 100644 index 0000000..f0597d2 --- /dev/null +++ b/deploy/quant_onnx/prepare_onnx.py @@ -0,0 +1,24 @@ +from torchvision.models.resnet import resnet18 +from torch import nn +import os +import torch + + +model = resnet18(pretrained=True) +model.fc = nn.Linear(512, 10) +if os.path.exists("r18_raw.pth"): + model.load_state_dict(torch.load("r18_raw.pth", map_location="cpu")) +else: + pass + +model.eval() + +a = torch.randn([1, 3, 224, 224]) +torch.onnx.export( + model, + a, + "r18.onnx", + input_names=["data"], + dynamic_axes={"data": {0: "batch", 2: "h", 3: "w"}}, + 
opset_version=13 +) diff --git a/deploy/quant_onnx/qt_atom_pose.py b/deploy/quant_onnx/qt_atom_pose.py new file mode 100644 index 0000000..9521f34 --- /dev/null +++ b/deploy/quant_onnx/qt_atom_pose.py @@ -0,0 +1,41 @@ +""" +Using atomquant to quant SparseInst model +""" +from atomquant.onnx.ptq_cpu import quantize_static_onnx +from atomquant.onnx.dataloader import get_calib_dataloader_coco +from torchvision import transforms +import cv2 +import numpy as np +import sys +import os +import onnxruntime as ort + + +def preprocess_func(img, target): + w = 192 + h = 256 + a = cv2.resize(img, (w, h)) + a_t = np.array(a).astype(np.float32) + boxes = [] + for t in target: + boxes.append(t["bbox"]) + target = np.array(boxes) + return a_t, target + + +def pqt(onnx_f): + coco_root = os.path.expanduser("~/data/coco/images/val2017") + anno_f = os.path.expanduser("~/data/coco/annotations/instances_val2017_val_val_train.json") + + session = ort.InferenceSession(onnx_f) + input_name = session.get_inputs()[0].name + + calib_dataloader = get_calib_dataloader_coco( + coco_root, anno_f, preprocess_func=preprocess_func, input_names=input_name, bs=1, max_step=50 + ) + quantize_static_onnx(onnx_f, calib_dataloader=calib_dataloader) + + +if __name__ == "__main__": + onnx_f = sys.argv[1] + pqt(onnx_f) diff --git a/deploy/quant_onnx/qt_atom_r18.py b/deploy/quant_onnx/qt_atom_r18.py new file mode 100644 index 0000000..1dd7e22 --- /dev/null +++ b/deploy/quant_onnx/qt_atom_r18.py @@ -0,0 +1,89 @@ +""" +Using atomquant to quant SparseInst model +""" +from atomquant.onnx.ptq_cpu import quantize_static_onnx +from atomquant.onnx.dataloader import ( + get_calib_dataloader_from_dataset, +) +from torchvision import transforms +import cv2 +import numpy as np +import sys +import os +import onnxruntime as ort +import torchvision +import time + + +def evaluate_onnx_model(model_p, test_loader, criterion=None): + running_loss = 0 + running_corrects = 0 + + session = ort.InferenceSession(model_p) + input_name = session.get_inputs()[0].name + + total = 0.0 + for inputs, labels in test_loader: + inputs = inputs.cpu().numpy() + labels = labels.cpu().numpy() + + start = time.perf_counter() + outputs = session.run([], {input_name: inputs}) + end = (time.perf_counter() - start) * 1000 + total += end + + outputs = outputs[0] + preds = np.argmax(outputs, 1) + if criterion is not None: + loss = criterion(outputs, labels).item() + else: + loss = 0 + # statistics + running_corrects += np.sum(preds == labels) + + # eval_loss = running_loss / len(test_loader.dataset) + eval_accuracy = running_corrects / len(test_loader.dataset) + total /= len(test_loader) + print(f"eval loss: {0}, eval acc: {eval_accuracy}, cost: {total}") + return 0, eval_accuracy + + +if __name__ == "__main__": + model_p = sys.argv[1] + model_qp = os.path.join( + os.path.dirname(model_p), + os.path.basename(model_p).replace(".onnx", "_int8.onnx"), + ) + + train_transform = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + test_transform = transforms.Compose( + [ + # transforms.RandomCrop(224, padding=4), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + train_set = torchvision.datasets.CIFAR10( + root="data", train=True, download=True, transform=train_transform + ) + test_set = torchvision.datasets.CIFAR10( + root="data", train=False, download=True, 
transform=test_transform + ) + + session = ort.InferenceSession(model_p) + input_name = session.get_inputs()[0].name + + calib_dataloader = get_calib_dataloader_from_dataset( + test_set, input_names=input_name, bs=1, max_step=100 + ) + quantize_static_onnx(model_p, calib_dataloader=calib_dataloader) + + evaluate_onnx_model(model_qp, calib_dataloader.dataloader_holder) + evaluate_onnx_model(model_p, calib_dataloader.dataloader_holder) diff --git a/deploy/quant_onnx/qt_atom_sparseinst.py b/deploy/quant_onnx/qt_atom_sparseinst.py new file mode 100644 index 0000000..427ed96 --- /dev/null +++ b/deploy/quant_onnx/qt_atom_sparseinst.py @@ -0,0 +1,41 @@ +""" +Using atomquant to quant SparseInst model +""" +from atomquant.onnx.ptq_cpu import quantize_static_onnx +from atomquant.onnx.dataloader import get_calib_dataloader_coco +from torchvision import transforms +import cv2 +import numpy as np +import sys +import os +import onnxruntime as ort + + +def preprocess_func(img, target): + w = 640 + h = 640 + a = cv2.resize(img, (w, h)) + a_t = np.array(a).astype(np.float32) + boxes = [] + for t in target: + boxes.append(t["bbox"]) + target = np.array(boxes) + return a_t, target + + +def pqt(onnx_f): + coco_root = os.path.expanduser("~/data/coco/images/val2017") + anno_f = os.path.expanduser("~/data/coco/annotations/instances_val2017_val_val_train.json") + + session = ort.InferenceSession(onnx_f) + input_name = session.get_inputs()[0].name + + calib_dataloader = get_calib_dataloader_coco( + coco_root, anno_f, preprocess_func=preprocess_func, input_names=input_name, bs=1, max_step=50 + ) + quantize_static_onnx(onnx_f, calib_dataloader=calib_dataloader) + + +if __name__ == "__main__": + onnx_f = sys.argv[1] + pqt(onnx_f) diff --git a/deploy/quant_onnx/qt_ort_cpu.py b/deploy/quant_onnx/qt_ort_cpu.py new file mode 100644 index 0000000..412dbb2 --- /dev/null +++ b/deploy/quant_onnx/qt_ort_cpu.py @@ -0,0 +1,153 @@ +""" +Test using onnxruntime to quantization a int8 CPU model +""" +from cv2 import calibrationMatrixValues +from onnxruntime.quantization import quantize_static, CalibrationMethod +from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType + +import onnxruntime as ort +from PIL import Image +import numpy as np +import os +import glob +import time +import sys +from torchvision import models +from torchvision import transforms +import torchvision +import torch + + +def prepare_dataloader(num_workers=8, train_batch_size=128, eval_batch_size=256): + train_transform = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + test_transform = transforms.Compose( + [ + # transforms.RandomCrop(224, padding=4), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + train_set = torchvision.datasets.CIFAR10( + root="data", train=True, download=True, transform=train_transform + ) + # We will use test set for validation and test in this project. + # Do not use test set for validation in practice! 
+ test_set = torchvision.datasets.CIFAR10( + root="data", train=False, download=True, transform=test_transform + ) + train_sampler = torch.utils.data.RandomSampler(train_set) + test_sampler = torch.utils.data.SequentialSampler(test_set) + + train_loader = torch.utils.data.DataLoader( + dataset=train_set, + batch_size=train_batch_size, + sampler=train_sampler, + num_workers=num_workers, + ) + test_loader = torch.utils.data.DataLoader( + dataset=test_set, + batch_size=eval_batch_size, + sampler=test_sampler, + num_workers=num_workers, + ) + return train_loader, test_loader + + +class CalibDataLoaderFromDataLoader(CalibrationDataReader): + def __init__(self, test_loader) -> None: + super().__init__() + self.test_loader = iter(test_loader) + + def get_next(self) -> dict: + res = next(self.test_loader, None) + if res: + images, labels = res + if isinstance(images, torch.Tensor): + images = images.cpu().numpy() + return {"data": images} + else: + return None + + +def evaluate_onnx_model(model_p, test_loader, criterion=None): + running_loss = 0 + running_corrects = 0 + + session = ort.InferenceSession(model_p) + input_name = session.get_inputs()[0].name + + total = 0.0 + for inputs, labels in test_loader: + inputs = inputs.cpu().numpy() + labels = labels.cpu().numpy() + + start = time.perf_counter() + outputs = session.run([], {input_name: inputs}) + end = (time.perf_counter() - start) * 1000 + total += end + + outputs = outputs[0] + preds = np.argmax(outputs, 1) + if criterion is not None: + loss = criterion(outputs, labels).item() + else: + loss = 0 + # statistics + running_corrects += np.sum(preds == labels) + + # eval_loss = running_loss / len(test_loader.dataset) + eval_accuracy = running_corrects / len(test_loader.dataset) + total /= len(test_loader) + print(f"eval loss: {0}, eval acc: {eval_accuracy}, cost: {total}") + return 0, eval_accuracy + + +def run_time(model_p): + session = ort.InferenceSession(model_p) + input_name = session.get_inputs()[0].name + total = 0.0 + runs = 10 + input_data = np.zeros((1, 3, 224, 224), np.float32) + _ = session.run([], {input_name: input_data}) + for i in range(runs): + start = time.perf_counter() + _ = session.run([], {input_name: input_data}) + end = (time.perf_counter() - start) * 1000 + total += end + print(f"{end:.2f}ms") + total /= runs + print(f"Avg: {total:.2f}ms") + + +if __name__ == "__main__": + model_p = sys.argv[1] + model_qp = os.path.join( + os.path.dirname(model_p), + os.path.basename(model_p).replace(".onnx", "_int8.onnx"), + ) + + train_loader, test_loader = prepare_dataloader(eval_batch_size=2) + dr = CalibDataLoaderFromDataLoader(test_loader) + quantize_static( + model_p, + model_qp, + dr, + quant_format=QuantFormat.QOperator, + per_channel=True, + weight_type=QuantType.QInt8, + calibrate_method=CalibrationMethod.MinMax, + ) + print("Calibrated and quantied.") + + run_time(model_qp) + evaluate_onnx_model(model_qp, test_loader) + run_time(model_p) + evaluate_onnx_model(model_p, test_loader) + diff --git a/deploy/quant_onnx/qt_ort_yolox.py b/deploy/quant_onnx/qt_ort_yolox.py new file mode 100644 index 0000000..a8f96f0 --- /dev/null +++ b/deploy/quant_onnx/qt_ort_yolox.py @@ -0,0 +1,152 @@ +""" +Test using onnxruntime to quantization a int8 CPU model +""" +from cv2 import calibrationMatrixValues +from onnxruntime.quantization import quantize_static, CalibrationMethod +from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantType + +import onnxruntime as ort +from PIL import Image +import numpy as np +import os 
+import glob +import time +import sys +from torchvision import models +from torchvision import transforms +import torchvision +import torch + + +def prepare_dataloader(num_workers=8, train_batch_size=128, eval_batch_size=256): + train_transform = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + test_transform = transforms.Compose( + [ + # transforms.RandomCrop(224, padding=4), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + train_set = torchvision.datasets.CIFAR10( + root="data", train=True, download=True, transform=train_transform + ) + # We will use test set for validation and test in this project. + # Do not use test set for validation in practice! + test_set = torchvision.datasets.CIFAR10( + root="data", train=False, download=True, transform=test_transform + ) + train_sampler = torch.utils.data.RandomSampler(train_set) + test_sampler = torch.utils.data.SequentialSampler(test_set) + + train_loader = torch.utils.data.DataLoader( + dataset=train_set, + batch_size=train_batch_size, + sampler=train_sampler, + num_workers=num_workers, + ) + test_loader = torch.utils.data.DataLoader( + dataset=test_set, + batch_size=eval_batch_size, + sampler=test_sampler, + num_workers=num_workers, + ) + return train_loader, test_loader + + +class CalibDataLoaderFromDataLoader(CalibrationDataReader): + def __init__(self, test_loader) -> None: + super().__init__() + self.test_loader = iter(test_loader) + + def get_next(self) -> dict: + res = next(self.test_loader, None) + if res: + images, labels = res + if isinstance(images, torch.Tensor): + images = images.cpu().numpy() + return {"data": images} + else: + return None + + +def evaluate_onnx_model(model_p, test_loader, criterion=None): + running_loss = 0 + running_corrects = 0 + + session = ort.InferenceSession(model_p) + input_name = session.get_inputs()[0].name + + total = 0.0 + for inputs, labels in test_loader: + inputs = inputs.cpu().numpy() + labels = labels.cpu().numpy() + + start = time.perf_counter() + outputs = session.run([], {input_name: inputs}) + end = (time.perf_counter() - start) * 1000 + total += end + + outputs = outputs[0] + preds = np.argmax(outputs, 1) + if criterion is not None: + loss = criterion(outputs, labels).item() + else: + loss = 0 + # statistics + running_corrects += np.sum(preds == labels) + + # eval_loss = running_loss / len(test_loader.dataset) + eval_accuracy = running_corrects / len(test_loader.dataset) + total /= len(test_loader) + print(f"eval loss: {0}, eval acc: {eval_accuracy}, cost: {total}") + return 0, eval_accuracy + + +def run_time(model_p): + session = ort.InferenceSession(model_p) + input_name = session.get_inputs()[0].name + total = 0.0 + runs = 10 + input_data = np.zeros((1, 3, 224, 224), np.float32) + _ = session.run([], {input_name: input_data}) + for i in range(runs): + start = time.perf_counter() + _ = session.run([], {input_name: input_data}) + end = (time.perf_counter() - start) * 1000 + total += end + print(f"{end:.2f}ms") + total /= runs + print(f"Avg: {total:.2f}ms") + + +if __name__ == "__main__": + model_p = sys.argv[1] + model_qp = os.path.join( + os.path.dirname(model_p), + os.path.basename(model_p).replace(".onnx", "_int8.onnx"), + ) + + train_loader, test_loader = prepare_dataloader(eval_batch_size=2) + dr = CalibDataLoaderFromDataLoader(test_loader) + quantize_static( + 
model_p, + model_qp, + dr, + quant_format=QuantFormat.QOperator, + per_channel=True, + weight_type=QuantType.QInt8, + calibrate_method=CalibrationMethod.MinMax, + ) + print("Calibrated and quantied.") + + run_time(model_p) + evaluate_onnx_model(model_p, test_loader) + run_time(model_qp) + evaluate_onnx_model(model_qp, test_loader) diff --git a/deploy/quant_onnx/qt_trt.py b/deploy/quant_onnx/qt_trt.py new file mode 100644 index 0000000..6c206d3 --- /dev/null +++ b/deploy/quant_onnx/qt_trt.py @@ -0,0 +1,177 @@ +from multiprocessing.spawn import prepare +import torch +import torch.utils.data +from torch import nn + +from pytorch_quantization import nn as quant_nn +from pytorch_quantization import calib +from pytorch_quantization.tensor_quant import QuantDescriptor +from tqdm import tqdm +import torchvision +from torchvision import models +from pytorch_quantization import quant_modules +from torchvision import transforms +import os +import time + +quant_modules.initialize() + + +def prepare_dataloader(num_workers=8, train_batch_size=128, eval_batch_size=256): + train_transform = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + test_transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) + train_set = torchvision.datasets.CIFAR10( + root="data", train=True, download=True, transform=train_transform + ) + # We will use test set for validation and test in this project. + # Do not use test set for validation in practice! + test_set = torchvision.datasets.CIFAR10( + root="data", train=False, download=True, transform=test_transform + ) + train_sampler = torch.utils.data.RandomSampler(train_set) + test_sampler = torch.utils.data.SequentialSampler(test_set) + + train_loader = torch.utils.data.DataLoader( + dataset=train_set, + batch_size=train_batch_size, + sampler=train_sampler, + num_workers=num_workers, + ) + test_loader = torch.utils.data.DataLoader( + dataset=test_set, + batch_size=eval_batch_size, + sampler=test_sampler, + num_workers=num_workers, + ) + return train_loader, test_loader + + +def evaluate_model(model, test_loader, device=torch.device("cpu"), criterion=None): + t0 = time.time() + model.eval() + model.to(device) + running_loss = 0 + running_corrects = 0 + for inputs, labels in test_loader: + + inputs = inputs.to(device) + labels = labels.to(device) + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + + if criterion is not None: + loss = criterion(outputs, labels).item() + else: + loss = 0 + + # statistics + running_loss += loss * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + eval_loss = running_loss / len(test_loader.dataset) + eval_accuracy = running_corrects / len(test_loader.dataset) + t1 = time.time() + print(f"eval loss: {eval_loss}, eval acc: {eval_accuracy}, cost: {t1 - t0}") + return eval_loss, eval_accuracy + + +def calib_and_quant(model, data_loader, test_loader): + def collect_stats(model, data_loader, num_batches): + """Feed data to the network and collect statistic""" + + # Enable calibrators + for name, module in model.named_modules(): + if isinstance(module, quant_nn.TensorQuantizer): + if module._calibrator is not None: + module.disable_quant() + module.enable_calib() + else: + module.disable() + + for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches): + 
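+            # Forward pass only: fake-quant is disabled here, so the calibrators just record activation ranges.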
model(image.cuda()) + if i >= num_batches: + break + + # Disable calibrators + for name, module in model.named_modules(): + if isinstance(module, quant_nn.TensorQuantizer): + if module._calibrator is not None: + module.enable_quant() + module.disable_calib() + else: + module.enable() + + def compute_amax(model, **kwargs): + # Load calib result + for name, module in model.named_modules(): + if isinstance(module, quant_nn.TensorQuantizer): + if module._calibrator is not None: + if isinstance(module._calibrator, calib.MaxCalibrator): + module.load_calib_amax() + else: + module.load_calib_amax(**kwargs) + print(f"{name:40}: {module}") + model.cuda() + + quant_desc_input = QuantDescriptor(calib_method="histogram") + quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input) + quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input) + + model.cuda() + with torch.no_grad(): + collect_stats(model, data_loader, num_batches=2) + compute_amax(model, method="percentile", percentile=99.99) + + evaluate_model(model, test_loader) + + # Save the model + torch.save(model.state_dict(), "/tmp/quant_resnet50-calibrated.pth") + return model + + +def quant(): + train_loader, test_loader = prepare_dataloader() + model = torchvision.models.resnet18() + model.fc = nn.Linear(512, 10) + if os.path.exists("r18_raw.pth"): + model.load_state_dict(torch.load("r18_raw.pth", map_location="cpu")) + else: + # train_model(model, train_loader, test_loader, torch.device("cuda")) + print("train finished.") + torch.save(model.state_dict(), "r18_row.pth") + + with torch.no_grad(): + model_quant = calib_and_quant(model, train_loader, test_loader) + model_quant.cuda() + dummy_input = torch.randn(128, 3, 224, 224, device="cuda") + + input_names = ["actual_input_1"] + output_names = ["output1"] + + quant_nn.TensorQuantizer.use_fb_fake_quant = True + print(model_quant) + # enable_onnx_checker needs to be disabled. See notes below. + torch.onnx.export( + model_quant, + dummy_input, + "quant_r18.onnx", + verbose=False, + opset_version=13, + ) + + +if __name__ == "__main__": + quant() diff --git a/deploy/quant_onnx/readme.md b/deploy/quant_onnx/readme.md new file mode 100644 index 0000000..638b1ba --- /dev/null +++ b/deploy/quant_onnx/readme.md @@ -0,0 +1,8 @@ +# Quant ONNX + +we using tools from ONNXRuntime to directly quantize onnx models and save int8 onnx model. + + +## Log + +- `2022.04.17`: quantize sparseinst and keypoints failed. 
Seems everything needs opset 13 to quantize; opset > 12 caused strange results when quantizing in onnxruntime.
diff --git a/deploy/quant_onnx/test_images/daisy.jpg b/deploy/quant_onnx/test_images/daisy.jpg
new file mode 100644
index 0000000..19c862c
Binary files /dev/null and b/deploy/quant_onnx/test_images/daisy.jpg differ
diff --git a/deploy/quant_onnx/test_images/rose.jpg b/deploy/quant_onnx/test_images/rose.jpg
new file mode 100644
index 0000000..5080a1c
Binary files /dev/null and b/deploy/quant_onnx/test_images/rose.jpg differ
diff --git a/deploy/quant_onnx/test_images/tulip.jpg b/deploy/quant_onnx/test_images/tulip.jpg
new file mode 100644
index 0000000..55297ed
Binary files /dev/null and b/deploy/quant_onnx/test_images/tulip.jpg differ
diff --git a/deploy/quant_tvm.py b/deploy/quant_tvm.py
old mode 100755
new mode 100644
diff --git a/deploy/trt_cc/.gitignore b/deploy/trt_cc/.gitignore
old mode 100755
new mode 100644
diff --git a/deploy/trt_cc/CMakeLists.txt b/deploy/trt_cc/CMakeLists.txt
old mode 100755
new mode 100644
diff --git a/deploy/trt_cc/demo_yolox.cc b/deploy/trt_cc/demo_yolox.cc
old mode 100755
new mode 100644
diff --git a/deploy/trt_cc/demo_yolox_origin.cc b/deploy/trt_cc/demo_yolox_origin.cc
old mode 100755
new mode 100644
diff --git a/deploy/trt_cc/logging.h b/deploy/trt_cc/logging.h
old mode 100755
new mode 100644
diff --git a/deploy/trt_cc/readme.md b/deploy/trt_cc/readme.md
old mode 100755
new mode 100644
diff --git a/docs/install.md b/docs/install.md
new file mode 100644
index 0000000..8c870ea
--- /dev/null
+++ b/docs/install.md
@@ -0,0 +1,63 @@
+# YOLOv7 Install
+
+> Please install detectron2 first; it is the basic dependency. Just clone the official repo and install it following their instructions.
+
+yolov7 is not a library; it is a project ready for use. But installing its dependencies still takes a few steps.
+
+First, install 2 important libraries that you might not be familiar with:
+
+```
+alfred-py
+nbnb
+```
+
+Both can be installed from pip. The first provides enhanced, full-featured visualization utilities for drawing boxes, masks, etc., plus convenient tools for visualizing your COCO dataset (VOC and YOLO formats are also supported). After installation, run `alfred` for more details.
+
+`nbnb` is a library that provides some useful common network blocks.
+
+Also, if you need fbnetv3, install mobilecv from Facebook:
+
+```
+pip install git+https://github.com/facebookresearch/mobile-vision.git
+```
+
+After installation, you are ready to train with YOLOv7:
+
+```
+python train_net.py --config-file configs/coco/darknet53.yaml --num-gpus 8
+```
+
+To train YOLOX:
+
+```
+python train_net.py --config-file configs/coco/yolox_s.yaml --num-gpus 8
+```
+
+## Train on Custom dataset
+
+If you want to train on a custom dataset, you **just need to convert your dataset to COCO format**. That is all you need to do.
+
+Then create a new folder for your dataset under `configs` and set your data paths in the config. Take the VisDrone dataset as an example:
+
+```
+DATASETS:
+  TRAIN: ("visdrone_train",)
+  TEST: ("visdrone_val",)
+```
+
+Then register your dataset in `train_visdrone.py`:
+
+```
+DATASET_ROOT = './datasets/visdrone'
+ANN_ROOT = os.path.join(DATASET_ROOT, 'visdrone_coco_anno')
+TRAIN_PATH = os.path.join(DATASET_ROOT, 'VisDrone2019-DET-train/images')
+VAL_PATH = os.path.join(DATASET_ROOT, 'VisDrone2019-DET-val/images')
+TRAIN_JSON = os.path.join(ANN_ROOT, 'VisDrone2019-DET_train_coco.json')
+VAL_JSON = os.path.join(ANN_ROOT, 'VisDrone2019-DET_val_coco.json')
+
+register_coco_instances("visdrone_train", {}, TRAIN_JSON, TRAIN_PATH)
+register_coco_instances("visdrone_val", {}, VAL_JSON, VAL_PATH)
+```
+
+Here you set your JSON path and your images path, and then you are ready to go.
+
diff --git a/docs/usage.md b/docs/usage.md
new file mode 100644
index 0000000..2b4e2cf
--- /dev/null
+++ b/docs/usage.md
@@ -0,0 +1,30 @@
+## Training
+
+You can refer to `install.md` for preparing your own dataset. Basically, just convert your dataset into COCO format and it's ready to go.
+
+We have 4 **key** training scripts:
+
+- `train_coco.py`: the most commonly used training script, for COCO;
+- `train_detr.py`: use this for **any** DETR or transformer-based model;
+- `train_net.py`: script for experimenting with changed training strategies, **used for experiments**;
+- `train_custom_datasets.py`: train on all customized datasets;
+
+For demos, you can use:
+
+- `demo.py`: visualize inference results;
+- `demo_lazyconfig.py`: demo using a `*.py` file as config;
+
+
+## Inference
+
+You can directly call `demo.py` to run inference and visualization. A typical command would be:
+
+```
+python demo.py --config-file configs/coco/sparseinst/sparse_inst_r50vd_giam_aug.yaml --video-input ~/Movies/Videos/86277963_nb2-1-80.flv -c 0.4 --opts MODEL.WEIGHTS weights/sparse_inst_r50vd_giam_aug_8bc5b3.pth
+```
+
+## Deploy
+
+YOLOv7 can be easily deployed via ONNX: use `export_onnx.py` with the corresponding config file to convert. A minimal sanity check of the exported model is sketched below.
+
+If you hit any problems with any model architecture, please file an issue.
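+As a quick check of an exported model, you can load it with onnxruntime and run a dummy input. This is only a minimal sketch: the model path and the 1x640x640x3 HWC input shape are assumptions here, so adjust them to whatever your own export actually produced.
+
+```
+import numpy as np
+import onnxruntime as ort
+
+# Assumed path; point this at the simplified ONNX file your export produced.
+onnx_path = "weights/sparse_inst_r50vd_giam_aug_8bc5b3_sim.onnx"
+
+session = ort.InferenceSession(onnx_path)
+inp = session.get_inputs()[0]
+print("input:", inp.name, inp.shape)
+
+# The exporter in this repo feeds HWC float32 images (no CHW transpose), batch size 1.
+dummy = np.random.rand(1, 640, 640, 3).astype(np.float32)
+outputs = session.run(None, {inp.name: dummy})
+for meta, out in zip(session.get_outputs(), outputs):
+    print(meta.name, out.shape)
+```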
diff --git a/export_onnx.py b/export.py old mode 100755 new mode 100644 similarity index 68% rename from export_onnx.py rename to export.py index 426d132..ec79d64 --- a/export_onnx.py +++ b/export.py @@ -5,7 +5,6 @@ import os import time import cv2 -from numpy.core.fromnumeric import sort import tqdm import torch import time @@ -21,9 +20,7 @@ from detectron2.checkpoint import DetectionCheckpointer from yolov7.config import add_yolo_config -import onnx_graphsurgeon as gs import onnx - from alfred.vis.image.mask import label2color_mask, vis_bitmasks from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy from alfred.dl.torch.common import device @@ -45,7 +42,6 @@ class DefaultPredictor: - def __init__(self, cfg): self.cfg = cfg.clone() # cfg can be modified by model self.model = build_model(self.cfg) @@ -68,15 +64,13 @@ def __call__(self, original_image): if self.input_format == "RGB": original_image = original_image[:, :, ::-1] height, width = original_image.shape[:2] - image = self.aug.get_transform( - original_image).apply_image(original_image) - print('image after transform: ', image.shape) - image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + image = self.aug.get_transform(original_image).apply_image(original_image) + print("image after transform: ", image.shape) + # image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + # do not do transpose here + image = torch.as_tensor(image.astype("float32")) inputs = {"image": image, "height": height, "width": width} - tic = time.time() predictions = self.model([inputs])[0] - c = time.time() - tic - print('cost: {}, fps: {}'.format(c, 1/c)) return predictions @@ -102,21 +96,21 @@ def setup_cfg(args): def get_parser(): - parser = argparse.ArgumentParser( - description="Detectron2 demo for builtin configs") + parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs") parser.add_argument( "--config-file", default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", metavar="FILE", help="path to config file", ) - parser.add_argument("--webcam", action="store_true", - help="Take inputs from webcam.") + parser.add_argument( + "--webcam", action="store_true", help="Take inputs from webcam." 
+ ) parser.add_argument("--video-input", help="Path to video file.") parser.add_argument( "--input", # nargs="+", - default='./images/COCO_val2014_000000001722.jpg', + default="./images/COCO_val2014_000000001722.jpg", help="A list of space separated input images; " "or a single glob pattern such as 'directory/*.jpg'", ) @@ -136,7 +130,7 @@ def get_parser(): "-v", "--verbose", default=False, - action='store_true', + action="store_true", help="verbose when onnx export", ) parser.add_argument( @@ -149,18 +143,23 @@ def get_parser(): def change_detr_onnx(onnx_path): - ''' + """ Fix default detr onnx model output all 0 - ''' - node_configs = [(1660, 1662), (2775, 2777), (2961, 2963), - (3333, 3335), (4077, 4079)] - if 'batch_2' in onnx_path: + """ + node_configs = [ + (1660, 1662), + (2775, 2777), + (2961, 2963), + (3333, 3335), + (4077, 4079), + ] + if "batch_2" in onnx_path: node_number = node_configs[1] - elif 'batch_4' in onnx_path: + elif "batch_4" in onnx_path: node_number = node_configs[2] - elif 'batch_8' in onnx_path: + elif "batch_8" in onnx_path: node_number = node_configs[3] - elif 'batch_16' in onnx_path: + elif "batch_16" in onnx_path: node_number = node_configs[4] else: node_number = node_configs[0] @@ -176,25 +175,32 @@ def change_detr_onnx(onnx_path): node.inputs[1].values = np.int64(5) print(node.inputs[1]) - onnx.save(gs.export_onnx(graph), onnx_path + '_changed.onnx') + onnx.save(gs.export_onnx(graph), onnx_path + "_changed.onnx") print(f"[INFO] onnx修改完成, 保存在{onnx_path + '_changed.onnx'}.") -def load_test_image(f, h, w): +def load_test_image(f, h, w, bs=1): a = cv2.imread(f) a = cv2.resize(a, (w, h)) - a_t = torch.tensor(a.astype(np.float32)).to(device).unsqueeze(0) + a_t = torch.tensor(a.astype(np.float32)).to(device).unsqueeze(0).repeat(bs, 1, 1, 1) return a_t, a def load_test_image_detr(f, h, w): """ - detr do not using + detr do not using """ a = cv2.imread(f) a = cv2.resize(a, (w, h)) a_t = torch.tensor(a.astype(np.float32)).permute(2, 0, 1).to(device) - return torch.stack([a_t, ]), a + return ( + torch.stack( + [ + a_t, + ] + ), + a, + ) # return torch.stack([a_t, a_t]), a @@ -222,57 +228,93 @@ def vis_res_fast(res, img, colors): clss = clss[indices] img = visualize_det_cv2_part( - img, scores, clss, bboxes, force_color=colors, is_show=True) + img, scores, clss, bboxes, force_color=colors, is_show=True + ) # img = cv2.addWeighted(img, 0.9, m, 0.6, 0.9) return img +def get_model_infos(config_file): + if "sparse_inst" in config_file: + output_names = ["masks", "scores", "labels"] + # output_names = ["masks", "scores"] + input_names = ["images"] + dynamic_axes = {"images": {0: "batch"}} + return input_names, output_names, dynamic_axes + elif "detr" in config_file: + return ["boxes", "scores", "labels"] + else: + return ["outs"] + + if __name__ == "__main__": mp.set_start_method("spawn", force=True) args = get_parser().parse_args() setup_logger(name="fvcore") logger = setup_logger() logger.info("Arguments: " + str(args)) - assert os.path.isfile( - args.input), 'onnx export only support send a image file.' + assert os.path.isfile(args.input), "onnx export only support send a image file." 
cfg = setup_cfg(args) - colors = [[random.randint(0, 255) for _ in range(3)] - for _ in range(cfg.MODEL.YOLO.CLASSES)] + colors = [ + [random.randint(0, 255) for _ in range(3)] + for _ in range(cfg.MODEL.YOLO.CLASSES) + ] metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) predictor = DefaultPredictor(cfg) - h = 768 - w = 960 - # h = 640 - # w = 640 - # inp, ori_img = load_test_image(args.input, h, w) - inp, ori_img = load_test_image_detr(args.input, h, w) - print('input shape: ', inp.shape) - # inp = inp.to(torch.device('cuda')) + # h = 1056 + # w = 1920 + h = 640 + w = 640 + inp, ori_img = load_test_image(args.input, h, w) + # TODO: remove hard coded for detr + # inp, ori_img = load_test_image_detr(args.input, h, w) + logger.info(f"input shape: {inp.shape}") model = predictor.model model = model.float() model.onnx_export = True onnx_f = os.path.join( - 'weights', os.path.basename(cfg.MODEL.WEIGHTS).split('.')[0] + '.onnx') - torch.onnx.export(model, inp, onnx_f, output_names={ - 'out'}, opset_version=12, do_constant_folding=True, verbose=args.verbose) - logger.info('Model saved into: {}'.format(onnx_f)) + "weights", os.path.basename(cfg.MODEL.WEIGHTS).split(".")[0] + ".onnx" + ) + + input_names, output_names, dynamic_axes = get_model_infos(args.config_file) + torch.onnx.export( + model, + inp, + onnx_f, + input_names=input_names, + output_names=output_names, + opset_version=11, + do_constant_folding=True, + verbose=args.verbose, + dynamic_axes=dynamic_axes, + ) + logger.info("Model saved into: {}".format(onnx_f)) # use onnxsimplify to reduce reduent model. - sim_onnx = onnx_f.replace('.onnx', '_sim.onnx') - os.system("python3 -m onnxsim {} {}".format(onnx_f, sim_onnx)) + sim_onnx = onnx_f.replace(".onnx", "_sim.onnx") + os.system( + f"python3 -m onnxsim {onnx_f} {sim_onnx} --dynamic-input-shape --input-shape 1,{h},{w},3" + ) logger.info("generate simplify onnx to: {}".format(sim_onnx)) - if 'detr' in sim_onnx: + if "detr" in sim_onnx: # this is need for detr onnx model change_detr_onnx(sim_onnx) - logger.info('test if onnx export logic is right...') + logger.info("test if onnx export logic is right...") model.onnx_vis = True out = model(inp) out = detr_postprocess(out, ori_img) # detr postprocess vis_res_fast(out, ori_img, colors=colors) + + logger.info('Now tracing model into torchscript.. 
If this failed, just ignore it.') + ts_f = os.path.join( + 'weights', os.path.basename(cfg.MODEL.WEIGHTS).split('.')[0] + '.pt') + traced = torch.jit.trace(model, inp) + torch.jit.save(traced, ts_f) + logger.info('Model saved into: {}'.format(ts_f)) diff --git a/images/COCO_val2014_000000001722.jpg b/images/COCO_val2014_000000001722.jpg old mode 100755 new mode 100644 diff --git a/images/COCO_val2014_000000001856.jpg b/images/COCO_val2014_000000001856.jpg old mode 100755 new mode 100644 diff --git a/images/COCO_val2014_000000001869.jpg b/images/COCO_val2014_000000001869.jpg old mode 100755 new mode 100644 diff --git a/images/COCO_val2014_000000001960.jpg b/images/COCO_val2014_000000001960.jpg old mode 100755 new mode 100644 diff --git a/images/COCO_val2014_000000002149.jpg b/images/COCO_val2014_000000002149.jpg old mode 100755 new mode 100644 diff --git a/images/COCO_val2014_000000002153.jpg b/images/COCO_val2014_000000002153.jpg old mode 100755 new mode 100644 diff --git a/images/COCO_val2014_000000002171.jpg b/images/COCO_val2014_000000002171.jpg old mode 100755 new mode 100644 diff --git a/images/COCO_val2014_000000002315.jpg b/images/COCO_val2014_000000002315.jpg old mode 100755 new mode 100644 diff --git a/images/COCO_val2014_000000002532.jpg b/images/COCO_val2014_000000002532.jpg old mode 100755 new mode 100644 diff --git a/images/dog.jpg b/images/dog.jpg new file mode 100644 index 0000000..77b0381 Binary files /dev/null and b/images/dog.jpg differ diff --git a/images/mask/u=1506317376,3450613040&fm=26&fmt=auto&gp=0.jpg b/images/mask/u=1506317376,3450613040&fm=26&fmt=auto&gp=0.jpg old mode 100755 new mode 100644 diff --git a/images/mask/u=3352497688,3286290828&fm=26&fmt=auto&gp=0.jpg b/images/mask/u=3352497688,3286290828&fm=26&fmt=auto&gp=0.jpg old mode 100755 new mode 100644 diff --git a/images/mask/u=3557104275,359021270&fm=26&fmt=auto&gp=0.jpg b/images/mask/u=3557104275,359021270&fm=26&fmt=auto&gp=0.jpg old mode 100755 new mode 100644 diff --git a/images/mask/u=4153583989,584404369&fm=26&fmt=auto&gp=0.jpg b/images/mask/u=4153583989,584404369&fm=26&fmt=auto&gp=0.jpg old mode 100755 new mode 100644 diff --git a/images/mask/u=724341885,3385420344&fm=26&fmt=auto&gp=0.jpg b/images/mask/u=724341885,3385420344&fm=26&fmt=auto&gp=0.jpg old mode 100755 new mode 100644 diff --git a/log.md b/log.md deleted file mode 100755 index 61359ef..0000000 --- a/log.md +++ /dev/null @@ -1,495 +0,0 @@ -- 2021.08.30: - - We now get another AP of YOLOX without preprocessing: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 38.053 | 57.665 | 41.176 | 23.614 | 41.971 | 47.851 | - ``` - maybe we need train with mixup as well. - - updated r2-50-fpn result, add SPP module: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 37.460 | 60.874 | 39.977 | 23.974 | 39.842 | 48.025 | - ``` - - -- 2021.08.26: - - A larger version of r2-50 YOLOv7 (double channel in FPN head) get: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 38.201 | 61.046 | 41.049 | 22.268 | 41.216 | 50.649 | - ``` - we need to know upper bound of this head. - - also, am testing Regnetx-400Mf version, with normal FPN head. <- very important. 
- - -- 2021.08.24: - - Updated YOLOX trained result: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 37.964 | 57.483 | 40.947 | 23.728 | 42.267 | 47.245 | - ``` - Almost 38, but we need disable normalnize in YOLOX as newly updated. Also applied FP16 enable to train. - swin-s YOLOv7 result: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 36.530 | 58.207 | 39.378 | 17.871 | 42.366 | 51.968 | - ``` - I think transformer-based need larger iterations. - - -- 2021.08.19: - - More epochs, and now r2-50 YOLOv7 get a better result: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 35.245 | 58.542 | 37.056 | 20.579 | 38.780 | 45.712 | - ``` - - And YOLOX get a better result as well (enable L1 loss at last 40k iters): - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 37.733 | 57.336 | 40.774 | 22.439 | 41.614 | 48.046 | - ``` - - Next, gonna change YOLOv7 arch, add FPN and PAN, also, add dropblocks. - IoU aware training and SPP. - - -- 2021.08.17: - - I trained yolov5 again, if i bigger anchor_t -> 4.0, it can work a little: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 35.814 | 70.814 | 33.020 | 10.189 | 32.159 | 43.572 | - ``` - but small result is poor, i also used YOLOv5 official coco's anchor settings. - so, does it work or not work? Hard to say. - - -- 2021.08.16: - - Finally, get result YOLOX trained AP: - - ``` - [08/14 06:28:23 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 36.572 | 56.028 | 39.497 | 22.921 | 40.583 | 46.090 | - - ``` - it not using stop augmentation trick and enable l1 loss at last 15 epochs. the overall iterations can be longger than 120000, etc. 180000, lr 0.02 -> 0.03 - - and r2-50 model: - - ``` - [08/14 08:13:06 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 34.950 | 58.071 | 37.104 | 20.319 | 37.877 | 45.645 | - ``` - - things needed to be added to YOLOv7: - - Dropblock; - - SPP; - - fpn and pan; - - enable l1 loss at last iterations; - - decouple head; - - IoU aware training. - - -- 2021.08.12: - - Now, we can reveal YOLOX eval result, but we have to train it, we forced using BGR as input order, - rather than RGB. since we don't want swap channel when opencv read the image, directly using BGR order in opencv. - - ``` - [08/12 01:08:32 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 25.162 | 43.746 | 25.611 | 14.494 | 29.015 | 31.481 | - - ``` - - I still can not get a good mAP on r2-50. - But I found now, using YOLOX can achieve 28.9 mAP. (By changing the dataset and some params), - YOLOX can get a resonable AP: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 35.181 | 54.611 | 38.210 | 21.914 | 39.098 | 44.240 | - - ``` - Now get a good 35 mAP for YOLOX. - Once achieve to 38 or 37 than it can be reprecated. 
- - Also, r2-50 get a good mAP now: - - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 30.731 | 54.494 | 31.426 | 16.675 | 33.858 | 39.642 | - - ``` - - -- 2021.08.11: - - I try to reveal eval result of YOLOX, I using exactly same weights from YOLOX, first - I found the AP is 31, far less than 39 claimed in YOLOX. Finally found it was because of am using BGR format - by default, but YOLOX using RGB format. - - After I change, the AP seems normal a little bit: - - ``` - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.351 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.535 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.384 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.236 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.406 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.406 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.283 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.434 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.447 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.296 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.497 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.524 - [08/11 16:57:32 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 35.138 | 53.534 | 38.365 | 23.565 | 40.614 | 40.553 | - ``` - - I changed the padding value from 0 to 114/255, now I can get a very close AP using YOLOX pretrained model: - - ``` - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.386 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.589 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.417 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.239 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.442 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.472 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.316 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.515 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.551 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.380 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.607 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.652 - [08/11 17:44:11 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 38.619 | 58.881 | 41.690 | 23.898 | 44.153 | 47.201 | - ``` - Now get a better one: - - ``` - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.389 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.589 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.421 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.237 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.441 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.486 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.318 - Average 
Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.526 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.575 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.403 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.635 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.684 - [08/11 18:07:58 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 38.910 | 58.889 | 42.128 | 23.701 | 44.142 | 48.574 | - - ``` - - However, still can not achieve YOLOX train, best now: - - ``` - [08/11 15:41:59 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 24.360 | 40.181 | 25.170 | 13.731 | 26.986 | 29.080 | - ``` - - -- 2021.08.07: - - I found using new masic and fixed size might help a little bit result, but trained on VisDrone still not good: - - iter 80000: - ``` - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:-----:|:------:|:------:| - | 15.049 | 33.972 | 11.666 | 8.957 | 21.881 | 21.588 | - ``` - why the AP so low? - - Next experiment, I need reveal YOLOX result, since everything is totally same. - - - I added RandomeResizeShortest to yolov7 datamapper, so that it can handle multi-scale inputs; - - Train 1w to see how visdrone will effected, also we enables l1 loss in training by default; - - I got mAP 21, mAP50 40 on visdrone. Not so bad. At least it seems low learning can avoid to local optimal. - - got mAP 23, mAP50 41.4 using YOLOXs. - it seems still a gap between sota methods: - - ``` - [08/11 09:09:54 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 23.423 | 42.075 | 22.554 | 15.606 | 33.356 | 34.577 | - ``` - -- 2021.08.06: - - train YOLOX but get bad result: - - iter 60000: - - ``` - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.240 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.410 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.250 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.133 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.272 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.289 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.227 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.369 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.392 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.233 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.430 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.470 - [08/06 10:52:53 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 24.039 | 41.046 | 24.978 | 13.258 | 27.167 | 28.879 | - ``` - How does it possible? Why all models can not get a reasonable AP? - I suspect it was because of bad data augmentation introduce from YOLOF, I copied mosiac aug from YOLOX and try again. - Now, new mosiac looks: - 1. disabled resize, using directly input; - 2. 
new mosiac can specifc input_size, and make it merge to real data to train; - 3. new mosiac reduced zero width boxes. - - -- 2021.08.04: - - Exp on normal yolov4 loss on voc: - - - Seems loss not drop, why? Same config as coco, but coco at least can drop; - -- 2021.08.03: - - Important notes: - - - Using divide to num_fg can get a lower conf loss, training can be stable; - - obj_mask must multiply conf loss, otherwise you will miss a lot fg objects. - - Original yolov3 loss actually can work. we need stick with it. Push it's limitions. - On the other hand, we need make YOLOv5 also able training, so that we can fuse YOLOv5 target tricks easily. - Waiting for YOLOX's training result. - -- 2021.08.01: - - I tried train on some large dataset such as VOC, currently tested r2-50 YOLOv7 arch we got: - - ``` - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.352 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.809 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.247 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.340 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.395 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.343 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.275 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.436 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.448 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.384 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.461 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.442 - [08/01 12:04:43 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 35.193 | 80.946 | 24.743 | 33.998 | 39.462 | 34.337 | - ``` - I think AP50 slight lower than standard YOLOv3, but seems the whole arch has no problem. - Next try test x-s-pafpn result with loss enhancement. Make sure it can perform well. - - Method to achieve a higher AP: - - Compact head design; - - SPP + PAN should perform better than r2-50, exp it; - - label smoothing + MixUp; - - - Train YOLOX, is the augmentation useful or not? - -- 2021.07.26: - - 2 things need to do for now: - - 1). xs_pafpn seems have a stable performance, reproduced it with batchsize 128; - 2). Reproduce coco result with cocomini, with batchsize 128 train; - 3). Get a reasonable AP on VOC or coco. - - Also, please fix tl val problem. - - I found coco hard to make it converge, or AP boost easily.... Hard to make it fully trained. - I should also train a res2net18 lite model, try what will happen with it. - -- 2021.07.25: - - We got a new record!! 
res2net50_v1d achieved a better result: - - ``` - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.649 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.980 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.796 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.449 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.654 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.659 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.477 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.690 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.704 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.497 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.711 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.713 - [07/25 09:46:38 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 64.912 | 97.988 | 79.599 | 44.884 | 65.412 | 65.926 | - [07/25 09:46:38 d2.evaluation.coco_evaluation]: Per-category bbox AP: - | category | AP | category | AP | - |:-----------|:-------|:-----------|:-------| - | face | 64.081 | face_mask | 65.743 | - [07/25 09:46:38 d2.engine.defaults]: Evaluation results for facemask_val in csv format: - [07/25 09:46:38 d2.evaluation.testing]: copypaste: Task: bbox - [07/25 09:46:38 d2.evaluation.testing]: copypaste: AP,AP50,AP75,APs,APm,APl - [07/25 09:46:38 d2.evaluation.testing]: copypaste: 64.9119,97.9876,79.5991,44.8837,65.4117,65.9264 - ``` - - mAP 64! above res50 a lot! - -- 2021.07.22: - - Why resnet doesn't work at all??? Even CSP-Darknet not works very well. - Same config with facemask_cspdarknet53, I train another r50 backbone. 
- - r50 v7 loss get a better AP: - - ``` - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.599 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.963 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.688 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.386 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.608 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.609 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.449 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.646 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.658 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.420 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.667 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.667 - [07/23 17:55:58 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 59.950 | 96.304 | 68.820 | 38.601 | 60.775 | 60.892 | - [07/23 17:55:58 d2.evaluation.coco_evaluation]: Per-category bbox AP: - | category | AP | category | AP | - |:-----------|:-------|:-----------|:-------| - | face | 58.198 | face_mask | 61.702 | - [07/23 17:55:58 d2.engine.defaults]: Evaluation results for facemask_val in csv format: - [07/23 17:55:58 d2.evaluation.testing]: copypaste: Task: bbox - [07/23 17:55:58 d2.evaluation.testing]: copypaste: AP,AP50,AP75,APs,APm,APl - [07/23 17:55:58 d2.evaluation.testing]: copypaste: 59.9499,96.3041,68.8203,38.6014,60.7751,60.8919 - [07/23 17:55:58 d2.utils.events]: eta: 23:19:17 iter: 119999 total_loss: 2.492 loss_box: 0.3664 - ``` - mAP 59.95! I found this AP might because of resnet50 output channels are: 512, 1024, 2048, - while dakrnet are 256, 512, 1024 - - I try reduce channel to 256 for resnet: - input 512: - ``` - res3 torch.Size([1, 256, 64, 64]) - res4 torch.Size([1, 512, 32, 32]) - res5 torch.Size([1, 1024, 16, 16]) - ``` - same as darknet. - - **I found res50 can not be train without pretrained model. Also the channel output can not be changed**. - - -- 2021.07.21: - - I found the way I using ciou has bug, it can not benifit model performance at all. - Debugging on it... 
- - Got the first result with ciou correct: - - ``` - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.494 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.909 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.476 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.269 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.523 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.498 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.386 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.567 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.578 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.308 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.598 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.584 - [07/22 09:07:35 d2.evaluation.coco_evaluation]: Evaluation results for bbox: - | AP | AP50 | AP75 | APs | APm | APl | - |:------:|:------:|:------:|:------:|:------:|:------:| - | 49.372 | 90.876 | 47.579 | 26.898 | 52.327 | 49.841 | - [07/22 09:07:35 d2.evaluation.coco_evaluation]: Per-category bbox AP: - | category | AP | category | AP | - |:-----------|:-------|:-----------|:-------| - | face | 51.021 | face_mask | 47.723 | - ``` - mAP 49.3! - -- 2021.07.19: - - Going test the this 2 trick can work or not: - 1. ciou loss; - 2. mosiac augmentation; - 3. larger lr better results? - 4. r50-fpn output channel set to 1024 gain improvements? - - lr too large make coco hard to converge, actually it is too big for the first serveral experiments. now try: - - 1. cspdarknet: be better accuracy; - 2. Does ciou work? - 3. Try YOLOX head design, darknet + pafpn head design; - - - -- 2021.07.07: coco-r50-pan seems work now. with all augmentation open. but trafficlight not work seems center point were shifted. (Problem solved) - a. tl center shifted, is it anchor reason or something? - b. mosiac augmentation actually works; - c. - - Above problem solved mainly by 2 reasons: - 1. the `demo.py` preprocess step not correctly aligned; - 2. the IGNOR_THRESHOLD set too low, this will effect training badly. \ No newline at end of file diff --git a/readme.md b/readme.md old mode 100755 new mode 100644 index 750432c..55f7ff5 --- a/readme.md +++ b/readme.md @@ -1,24 +1,57 @@ -# YOLOv7 - Beyond Detection -![](https://z3.ax1x.com/2021/09/08/hHPhUx.png) -![](https://z3.ax1x.com/2021/09/08/hHPIPK.png) -![](https://z3.ax1x.com/2021/09/08/hHP7xe.png) +
+ -> This is the first and only (for now) **`YOLO family variant with transformers!`**, and a more advanced YOLO with multi-tasking, such as detection & segmentation at the same time! +

YOLOv7 - Make YOLO Great Again

+[Documentation](https://github.com/jinfagang/yolov7) • +[Installation Instructions](https://github.com/jinfagang/yolov7) • +[Deployment](#deploy) • +[Contributing](.github/CONTRIBUTING.md) • +[Reporting Issues](https://github.com/jinfagang/yolov7/issues/new?assignees=&labels=&template=bug-report.yml) -Just another yolo variant implemented based on **`detectron2`**. Be note that **YOLOv7 doesn't meant to be a successor of yolo family, 7 is just my magic and lucky number**. In our humble opinion, a good opensource project must have these features: -- It must be reproduceble; -- It must be simple and understandable; -- It must be build with the weapon of the edge; -- It must have a good maintainance, listen to the voice from community; +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/yolort)](https://pypi.org/project/alfred-py/) +[![PyPI downloads](https://static.pepy.tech/personalized-badge/alfred-py?period=total&units=international_system&left_color=grey&right_color=blue&left_text=pypi%20downloads)](https://pepy.tech/project/yolort) +[![Github downloads](https://img.shields.io/github/downloads/jinfagang/yolov7/total?color=blue&label=downloads&logo=github&logoColor=lightgrey)](https://img.shields.io/github/downloads/jinfagang/yolov7/total?color=blue&label=Downloads&logo=github&logoColor=lightgrey) -However, we found many opensource detection framework such as YOLOv5, Efficientdet have their own weakness, for example, YOLOv5 is very good at reproduceable but really over-engineered, too many messy codes. What's more surprisingly, there were at least 20+ different version of re-implementation of YOLOv3-YOLOv4 in pytorch, 99.99% of them were totally **wrong**, either can u train your dataset nor make it mAP comparable with origin paper.(However, *doesn't mean this work is totally right, use at your own risk*.) +[![codecov](https://codecov.io/gh/zhiqwang/yolov5-rt-stack/branch/main/graph/badge.svg?token=1GX96EA72Y)](https://codecov.io/gh/zhiqwang/yolov5-rt-stack) +[![license](https://img.shields.io/github/license/zhiqwang/yolov5-rt-stack?color=dfd)](LICENSE) +[![Slack](https://img.shields.io/badge/slack-chat-aff.svg?logo=slack)](https://join.slack.com/t/yolort/shared_invite/zt-mqwc7235-940aAh8IaKYeWclrJx10SA) +[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-pink.svg)](https://github.com/jinfagang/yolov7/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) -That's why we have this project! It's much more simpler to experiment different ARCH of YOLO build upon detectron2 with YOLOv7! Most importantly, more and more decent YOLO series model merged into this repo such as YOLOX (most decent in 2021). We also **welcome any trick/experiment PR on YOLOv7, help us build it better and stronger!!**. Please **star it and fork it right now!**. +
+ +## Migration Warning! + +Since someone else created another YOLOv7 **after** us, we don't want people to get the two mixed up, **and we don't want to chase meaningless AP numbers as a kind of stunt**. So we plan to move further development of YOLOv7 to a new place -> [YOLOvn link](https://github.com/jinfagang/yolovn). **The new framework will keep being developed!** The unfinished PRs here will be merged first, and then the migration starts. Thanks for everyone's contribution! Again, the new framework is not only for re-implementing SOTA models but also for exploring new model designs; **we are not only exploring detection, but also multi-tasking and new transformer arch design**. + + + +> In short: **YOLOv7 added instance segmentation to the YOLO arch**. Many transformer backbones and archs are included as well. If you look carefully, you'll find our ultimate vision is to **make YOLO great again** by the power of **transformers**, as well as **multi-task training**. YOLOv7 achieves mAP 43, and its AP-s exceeds Mask R-CNN by 10 with a convnext-tiny backbone at a speed similar to YOLOX-s; more models are listed below, more accurate and even lighter! + +> GPU resources wanted! The next yolov7 version is upcoming, but I don't have enough GPUs to train pretrained models for everyone; if you have GPUs, please open a discussion and ping me, and I will guide you through training the new models. + +Thanks to Aarohi's youtube vlog for the guidance on yolov7: https://www.youtube.com/watch?v=ag88beS_fvM , if you want a quick start, take a look at this nice introduction to yolov7 and detectron2. + +For those who still say we shouldn't use the name yolov7, here is the clarification: we created this repo much earlier than someone else's paper, and we don't want to confuse you either, but as we said, we took this name a long time ago. Besides, our yolov7 is a framework: the whole **modeling is very intuitive**, unlike yolov5's yml-config way of defining models; it's pure Python, all at your control. And inside yolov7 we support a huge range of combinations such as YOLOX, YOLOX-Lite, YOLOX-Mask, YOLOX-Keypoint, YOLOv6 Head, YOLOv4, Mosaic Augmentation, etc. **Which framework you use is your choice; please stop bothering us about the naming, and please take a look at the repo creation time in the screenshot below**. **WE ALREADY EXISTED LAST YEAR**. +![](https://raw.githubusercontent.com/jinfagang/public_images/master/20220718184052.png) + +## New version will be released! + +**YOLOv7** v2.0 will be released soon! We will release our Convnext-tiny YOLO arch model that achieves mAP 43.9 with very low latency! Features included in the next version: + +- Support for the EfficientFormer backbone; +- Support for the new YOLO2Go model: lighter, much faster and more accurate; +- Support for the MobileOne backbone; + +For more details, refer to [read the doc](https://yolov7.readthedocs.io/en/latest). + +Just **fork and star!**, and you will be notified once we release the new version! + +🔥🔥🔥 Just another yolo variant implemented based on **`detectron2`**. But note that **YOLOv7 isn't meant to be a successor of the yolo family, 7 is just a magic and lucky number. Instead, YOLOv7 extends yolo into many other vision tasks, such as instance segmentation, one-stage keypoints detection, etc.**.
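Editor's note: since the readme stresses that modeling and inference here are plain Python on top of detectron2, a minimal inference sketch may help. It only re-uses pieces that already appear in this patch (`add_yolo_config`, `build_model`, `DetectionCheckpointer`, the `ResizeShortestEdge` preprocessing); the config path, weights file and test image are placeholders borrowed from elsewhere in the patch, not a prescribed setup.

```
# Minimal inference sketch (illustrative only, not part of this patch).
# It mirrors the DefaultPredictor wrappers used by the scripts in this diff;
# swap the config/weights/image placeholders for your own files.
import cv2
import torch
import detectron2.data.transforms as T
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.modeling import build_model

from yolov7.config import add_yolo_config

cfg = get_cfg()
add_yolo_config(cfg)
cfg.merge_from_file("configs/coco/yolox_s.yaml")            # any config under configs/
cfg.MODEL.WEIGHTS = "output/coco_yolox_s/model_final.pth"   # your trained checkpoint
cfg.freeze()

model = build_model(cfg)
model.eval()
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)

img = cv2.imread("images/dog.jpg")  # BGR, as everywhere else in this repo
aug = T.ResizeShortestEdge(
    [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
)
image = aug.get_transform(img).apply_image(img)
image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))

with torch.no_grad():
    predictions = model(
        [{"image": image, "height": img.shape[0], "width": img.shape[1]}]
    )[0]
print(predictions["instances"])
```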
The supported matrix in YOLOv7 are: @@ -41,52 +74,148 @@ The supported matrix in YOLOv7 are: - [x] YOLOv7 with Res2Net-v1d backbone, we **found res2net-v1d** have a better accuracy then darknet53; - [x] Added PPYOLOv2 PAN neck with SPP and dropblock; - [x] YOLOX arch added, now you can train YOLOX model (**anchor free yolo**) as well; -- [ ] DETR: transformer based detection model and **onnx export supported, as well as TensorRT acceleration**; +- [x] DETR: transformer based detection model and **onnx export supported, as well as TensorRT acceleration**; +- [x] AnchorDETR: Faster converge version of detr, now supported! +- [x] Almost all models can export to onnx; +- [x] Supports TensorRT deployment for DETR and other transformer models; +- [ ] It will integrate with [wanwu](https://github.com/jinfagang/wanwu_release), a torch-free deploy framework run fastest on your target platform. + + +> ⚠️ Important note: **YOLOv7 on Github not the latest version, many features are closed-source but you can get it from https://manaai.cn** + +Features are ready but not opensource yet: + +- [x] Convnext training on YOLOX, higher accuracy than original YOLOX; +- [x] GFL loss support; +- [x] **MobileVit-V2** backbone available; +- [x] CSPRep-Resnet: a repvgg style resnet used in PP-YOLOE but in pytorch rather than paddle; +- [ ] VitDet support; +- [ ] Simple-FPN support from VitDet; +- [ ] PP-YOLOE head supported; + +If you want get full version YOLOv7, either **become a contributor** or get from https://manaai.cn . + + +## 🆕 News! + +- ***2022.07.26***: Now we are preparing release new pose model; +- ***2022.06.25***: Meituan's YOLOv6 training has been supported in YOLOv7! +- ***2022.06.13***: New model **YOLOX-Convnext-tiny** got a ~~41.3~~ 43 mAP beats yolox-s, AP-small even higher!; +- ***2022.06.09***: **GFL**, general focal loss supported; +- ***2022.05.26***: Added **YOLOX-ConvNext** config; +- ***2022.05.18***: DINO, DNDetr and DABDetr are about added, new records on coco up to 63.3 AP! +- ***2022.05.09***: Big new function added! **We adopt YOLOX with Keypoints Head!**, model still under train, but you can check at code already; +- ***2022.04.23***: We finished the int8 quantization on SparseInst! It works perfect! Download the onnx try it our by your self. +- ***2022.04.15***: Now, we support the `SparseInst` onnx expport! +- ***2022.03.25***: New instance seg supported! 40 FPS @ 37 mAP!! Which is fast; +- ***2021.09.16***: First transformer based DETR model added, will explore more DETR series models; +- ***2021.08.02***: **YOLOX** arch added, you can train YOLOX as well in this repo; +- ***2021.07.25***: We found **YOLOv7-Res2net50** beat res50 and darknet53 at same speed level! 5% AP boost on custom dataset; +- ***2021.07.04***: Added YOLOF and we can have a anchor free support as well, YOLOF achieves a better trade off on speed and accuracy; +- ***2021.06.25***: this project first started. +- more -## Rules +## 🌹 Contribution Wanted -There are some rules you must follow to if you want train on your own dataset: +If you have spare time or if you have GPU card, then help YOLOv7 become more stronger! Here is the guidance of contribute: -- Rule No.1: Always set your own anchors on your dataset, using `tools/compute_anchors.py`, this applys to any other anchor-based detection methods as well (EfficientDet etc.); -- Rule No.2: Keep a faith on your loss will goes down eventually, if not, dig deeper to find out why (but do not post issues repeated caused I might don't know either.). 
-- Rule No.3: No one will tells u but it's real: *do not change backbone easily, whole params coupled with your backbone, dont think its simple as you think it should be*, also a Deeplearning engineer **is not an easy work as you think**, the whole knowledge like an ocean, and your knowledge is just a tiny drop of water... -- Rule No.4: **must** using pretrain weights for **transoformer based backbone**, otherwise your loss will bump; +1. **`Claim task`**: I have some ideas but do not have enough time to do it, if you want to implement it, claim the task, **I will give u detailed advise on how to do, and you can learn a lot from it**; +2. **`Test mAP`**: When you finished new idea implementation, create a thread to report experiment mAP, if it work, then merge into our main master branch; +3. **`Pull request`**: YOLOv7 is open and always tracking on SOTA and **light** models, if a model is useful, we will merge it and deploy it, distribute to all users want to try. -Make sure you have read **rules** before ask me any questions. +Here are some tasks need to be claimed: +- [ ] VAN: Visual Attention Network, [paper](https://arxiv.org/abs/2202.09741), [VAN-Segmentation](https://github.com/Visual-Attention-Network/VAN-Segmentation), it was better than Swin and PVT and DeiT: + - [ ] D2 VAN backbone integration; + - [ ] Test with YOLOv7 arch; +- [ ] ViDet: [code](https://github.com/naver-ai/vidt), this provides a realtime detector based on transformer, Swin-Nano mAP: 40, while 20 FPS, it can be integrated into YOLOv7; + - [ ] Integrate into D2 backbone, remove MSAtten deps; + - [ ] Test with YOLOv7 or DETR arch; +- [ ] DINO: 63.3mAP highest in 2022 on coco. + - [ ] Code for [DINO](https://arxiv.org/abs/2203.03605) is avaliable [here](https://github.com/IDEACVR/DINO). +- [x] ConvNext: https://github.com/facebookresearch/ConvNeXt, combined convolution and transformer. +- [ ] NASVit: https://github.com/facebookresearch/NASViT +- [ ] MobileVIT: https://github.com/apple/ml-cvnets/blob/main/cvnets/models/classification/mobilevit.py +- [ ] DAB-DETR: https://github.com/IDEA-opensource/DAB-DETR, WIP +- [ ] DN-DETR: https://github.com/IDEA-opensource/DN-DETR +- [ ] EfficientNetV2: https://github.com/jahongir7174/EfficientNetV2 +Just join our in-house contributor plan, you can share our newest code with your contribution! -## News! -- **2021.09.16**: First transformer based DETR model added, will explore more DETR series models; -- **2021.08.02**: **YOLOX** arch added, you can train YOLOX as well in this repo; -- **2021.07.25**: We found **YOLOv7-Res2net50** beat res50 and darknet53 at same speed level! 5% AP boost on custom dataset; -- **2021.07.04**: Added YOLOF and we can have a anchor free support as well, YOLOF achieves a better trade off on speed and accuracy; -- **2021.06.25**: this project first started. 
-- more +## 💁‍♂️ Results +| YOLOv7 Instance | Face & Detection | +:-------------------------:|:-------------------------: +![](https://z3.ax1x.com/2021/09/08/hHPhUx.png) | ![](https://z3.ax1x.com/2021/07/19/WGVhlj.png) +![](https://z3.ax1x.com/2021/09/08/hHP7xe.png) | ![](https://z3.ax1x.com/2021/07/22/WDr5V0.png) +![](https://s1.ax1x.com/2022/03/25/qN5zp6.png) | ![](https://s2.loli.net/2022/03/25/MBwq9YT7zC5Sd1A.png) +![](https://s1.ax1x.com/2022/05/09/OJnXjI.png) | ![](https://s1.ax1x.com/2022/05/09/OJuuUU.png) +![](https://raw.githubusercontent.com/jinfagang/public_images/master/20220613110908.png) | ![](https://raw.githubusercontent.com/jinfagang/public_images/master/20220613111122.png) +![](https://raw.githubusercontent.com/jinfagang/public_images/master/20220613111139.png) | ![](https://raw.githubusercontent.com/jinfagang/public_images/master/20220613111239.png) +![](https://s2.loli.net/2022/07/26/1Msgxupz4VWboqX.png) | ![](https://s2.loli.net/2022/07/26/NzahO46qcsU52Gn.png) +![](https://raw.githubusercontent.com/jinfagang/public_images/master/20220801102656.png) | ![](https://raw.githubusercontent.com/jinfagang/public_images/master/20220801102828.png) -## Train -For training, quit simple, same as detectron2: +## 🧑‍🦯 Installation && Quick Start -``` -python train_net.py --config-file configs/coco/darknet53.yaml --num-gpus 8 -``` +- See [docs/install.md](docs/install.md) -If you want train YOLOX, you can using config file `configs/coco/yolox_s.yaml`. All support arch are: +Special requirements (other version may also work, but these are tested, with best performance, including ONNX export best support): -- **YOLOX**: anchor free yolo; -- **YOLOv7**: traditional yolo with some explorations, mainly focus on loss experiments; -- **YOLOv7P**: traditional yolo merged with decent arch from YOLOX; -- **YOLOMask**: arch do detection and segmentation at the same time (tbd); -- **YOLOInsSeg**: instance segmentation based on YOLO detection (tbd); +- torch 1.11 (stable version) +- onnx +- onnx-simplifier 0.3.7 +- alfred-py latest +- detectron2 latest + +If you using lower version torch, onnx exportation might not work as our expected. + + + +## 🤔 Features + +Some highlights of YOLOv7 are: +- A simple and standard training framework for any detection && instance segmentation tasks, based on detectron2; +- Supports DETR and many transformer based detection framework out-of-box; +- Supports easy to deploy pipeline thought onnx. +- **This is the only framework support YOLOv4 + InstanceSegmentation** in single stage style; +- Easily plugin into transformers based detector; -## Demo +We are strongly recommend you send PR if you have any further development on this project, **the only reason for opensource it is just for using community power to make it stronger and further**. It's very welcome for anyone contribute on any features! 
+ +## 🧙‍♂️ Pretrained Models + +| model | backbone | input | aug | APval | AP | FPS | weights | +| :---- | :------ | :---: | :-: |:--------------: | :--: | :-: | :-----: | +| [SparseInst](configs/sparse_inst_r50_base.yaml) | [R-50]() | 640 | ✘ | 32.8 | - | 44.3 | [model](https://drive.google.com/file/d/12RQLHD5EZKIOvlqW3avUCeYjFG1NPKDy/view?usp=sharing) | +| [SparseInst](sparse_inst_r50vd_base.yaml) | [R-50-vd](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth) | 640 | ✘ | 34.1 | - | 42.6 | [model]()| +| [SparseInst (G-IAM)](configs/sparse_inst_r50_giam.yaml) | [R-50]() | 608 | ✘ | 33.4 | - | 44.6 | [model](https://drive.google.com/file/d/1pXU7Dsa1L7nUiLU9ULG2F6Pl5m5NEguL/view?usp=sharing) | +| [SparseInst (G-IAM)](configs/sparse_inst_r50_giam_aug.yaml) | [R-50]() | 608 | ✓ | 34.2 | 34.7 | 44.6 | [model](https://drive.google.com/file/d/1MK8rO3qtA7vN9KVSBdp0VvZHCNq8-bvz/view?usp=sharing) | +| [SparseInst (G-IAM)](configs/sparse_inst_r50_dcn_giam_aug.yaml) | [R-50-DCN]() | 608 | ✓| 36.4 | 36.8 | 41.6 | [model](https://drive.google.com/file/d/1qxdLRRHbIWEwRYn-NPPeCCk6fhBjc946/view?usp=sharing) | +| [SparseInst (G-IAM)](configs/sparse_inst_r50vd_giam_aug.yaml) | [R-50-vd](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth) | 608 | ✓| 35.6 | 36.1 | 42.8| [model](https://drive.google.com/file/d/1dlamg7ych_BdWpPUCuiBXbwE0SXpsfGx/view?usp=sharing) | +| [SparseInst (G-IAM)](configs/sparse_inst_r50vd_dcn_giam_aug.yaml) | [R-50-vd-DCN](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth) | 608 | ✓ | 37.4 | 37.9 | 40.0 | [model](https://drive.google.com/file/d/1clYPdCNrDNZLbmlAEJ7wjsrOLn1igOpT/view?usp=sharing)| +| [SparseInst (G-IAM)](sparse_inst_r50vd_dcn_giam_aug.yaml) | [R-50-vd-DCN](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth) | 640 | ✓ | 37.7 | 38.1 | 39.3 | [model](https://drive.google.com/file/d/1clYPdCNrDNZLbmlAEJ7wjsrOLn1igOpT/view?usp=sharing)| +| SparseInst Int8 onnx | [google drive](https://drive.google.com/file/d/1FYn_0p3RXzKaTGzTfdiJI1YhAexA_V3s/view?usp=sharing) | + + + +## 🧙‍♂️ Models trained in YOLOv7 + +| model | backbone | input | aug | AP | AP50 | APs | FPS | weights | +| :---- | :------ | :---: | :-: |:---: | :--: | :-: | :-: | :-----: | +| [YoloFormer-Convnext-tiny](configs/coco/yolotr/yolotr_convnext.yaml) | [Convnext-tiny](https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth) | 800 | ✓ | 43 | 63.7 | 26.5 | 39.3 | [model](https://drive.google.com/file/d/1bTedWQaENvlFknqyQreBKA1HoAMOtHkn/view?usp=sharing)| +| [YOLOX-s](configs/coco/yolox_s.yaml) | - | 800 | ✓ | 40.5 | - | - | 39.3 | [model](https://drive.google.com/file/d/1clYPdCNrDNZLbmlAEJ7wjsrOLn1igOpT/view?usp=sharing)| + +> note: We post AP-s here because we want to know how does small object performance in related model, it was notablely higher small-APs for transformer backbone based model! **Some of above model might not opensourced but we provide weights**. + + + +## 🥰 Demo Run a quick demo would be like: @@ -94,85 +223,130 @@ Run a quick demo would be like: python3 demo.py --config-file configs/wearmask/darknet53.yaml --input ./datasets/wearmask/images/val2017 --opts MODEL.WEIGHTS output/model_0009999.pth ``` +Run a quick demo to upload and explore your YOLOv7 prediction with [Weights & Biases](https://wandb.ai/site) +. 
[See here for an example](https://wandb.ai/parambharat/yolov7) + +``` +python3 demo.py --config-file configs/wearmask/darknet53.yaml --input ./datasets/wearmask/images/val2017 --wandb-entity --wandb-project --opts MODEL.WEIGHTS output/model_0009999.pth +``` + +Run SparseInst: + +``` +python demo.py --config-file configs/coco/sparseinst/sparse_inst_r50vd_giam_aug.yaml --video-input ~/Movies/Videos/86277963_nb2-1-80.flv -c 0.4 --opts MODEL.WEIGHTS weights/sparse_inst_r50vd_giam_aug_8bc5b3.pth +``` + **an update based on detectron2 newly introduced LazyConfig system, run with a LazyConfig model using**: ``` python3 demo_lazyconfig.py --config-file configs/new_baselines/panoptic_fpn_regnetx_0.4g.py --opts train.init_checkpoint=output/model_0004999.pth ``` +## 😎 Train -## Export ONNX && TensorRTT && TVM +For training, quite simple, same as detectron2: -1. `detr`: +``` +python train_net.py --config-file configs/coco/darknet53.yaml --num-gpus 8 +``` - ``` - - ``` +If you want train YOLOX, you can using config file `configs/coco/yolox_s.yaml`. All support arch are: + +- **YOLOX**: anchor free yolo; +- **YOLOv7**: traditional yolo with some explorations, mainly focus on loss experiments; +- **YOLOv7P**: traditional yolo merged with decent arch from YOLOX; +- **YOLOMask**: arch do detection and segmentation at the same time (tbd); +- **YOLOInsSeg**: instance segmentation based on YOLO detection (tbd); + + +## 😎 Rules + +There are some rules you must follow to if you want train on your own dataset: -## More Advanced YOLO +- Rule No.1: Always set your own anchors on your dataset, using `tools/compute_anchors.py`, this applys to any other anchor-based detection methods as well (EfficientDet etc.); +- Rule No.2: Keep a faith on your loss will goes down eventually, if not, dig deeper to find out why (but do not post issues repeated caused I might don't know either.). +- Rule No.3: No one will tells u but it's real: *do not change backbone easily, whole params coupled with your backbone, dont think its simple as you think it should be*, also a Deeplearning engineer **is not an easy work as you think**, the whole knowledge like an ocean, and your knowledge is just a tiny drop of water... +- Rule No.4: **must** using pretrain weights for **transoformer based backbone**, otherwise your loss will bump; -Here we show some highlights on multi-tasking: +Make sure you have read **rules** before ask me any questions. +## 🔨 Export ONNX && TensorRTT && TVM -## Performance +1. `detr`: -Here is a dedicated performance compare with other packages. + ``` + python export.py --config-file detr/config/file + ``` + this works has been done, inference script included inside `tools`. +2. `AnchorDETR`: -## Some Tiny Object Datasets supported + anchorDETR also supported training and exporting to ONNX. -- **Wearmask**: - support VOC, Yolo, coco 3 format. You can using coco format here. Download from: 链接: https://pan.baidu.com/s/1ozAgUFLqfTXLp-iOecddqQ 提取码: xgep . Using `configs/wearmask` to train this dataset. -- **more**: - to go. +3. `SparseInst`: + Sparsinst already supported exporting to onnx!! 
+ + ``` + python export.py --config-file configs/coco/sparseinst/sparse_inst_r50_giam_aug.yaml --video-input ~/Videos/a.flv --opts MODEL.WEIGHTS weights/sparse_inst_r50_giam_aug_2b7d68.pth INPUT.MIN_SIZE_TEST 512 + ``` + If you are on a CPU device, please using: + ``` + python export.py --config-file configs/coco/sparseinst/sparse_inst_r50_giam_aug.yaml --input images/COCO_val2014_000000002153.jpg --verbose --opts MODEL.WEIGHTS weights/sparse_inst_r50_giam_aug_2b7d68.pth MODEL.DEVICE 'cpu' + ``` + Then you can have `weights/sparse_inst_r50_giam_aug_2b7d68_sim.onnx` generated, this onnx can be inference using ORT without any unsupported ops. -## Detection Results +## 🤒️ Performance -![](https://z3.ax1x.com/2021/07/22/WDs9PO.png) -![](https://z3.ax1x.com/2021/07/22/WDr5V0.png) -![](https://z3.ax1x.com/2021/07/19/WGVhlj.png) -![](https://z3.ax1x.com/2021/07/26/WWBxi9.png) +Here is a dedicated performance compare with other packages. +tbd. +## 🪜 Some Tiny Object Datasets supported + +- **Wearmask**: + support VOC, Yolo, coco 3 format. You can using coco format here. Download from: 链接: https://pan.baidu.com/s/1ozAgUFLqfTXLp-iOecddqQ 提取码: xgep . Using `configs/wearmask` to train this dataset. +- **more**: + to go. -## Some Exp Visualizations -1. GridMask +## 👋 Detection Results - ![](https://z3.ax1x.com/2021/06/27/RYeJkd.png) - ![](https://z3.ax1x.com/2021/07/06/Roj5dg.png) +| Image | Detections | +:-------------------------:|:-------------------------: +![](https://z3.ax1x.com/2021/07/22/WDs9PO.png) | ![](https://z3.ax1x.com/2021/07/22/WDr5V0.png) +![](https://z3.ax1x.com/2021/07/19/WGVhlj.png) | ![](https://z3.ax1x.com/2021/07/26/WWBxi9.png) - Our GridMask augmentation also supports 2 modes. -2. Mosaic +## 😯 Dicussion Group - ![](https://z3.ax1x.com/2021/07/06/RIX1iR.png) - ![](https://z3.ax1x.com/2021/07/06/Roq97d.png) +| Wechat | QQ | +:-------------------------:|:-------------------------: +![image.png](https://s2.loli.net/2022/03/14/9uxaEnDA6vdByr2.png) | ![image.png](https://s2.loli.net/2022/02/28/C4gjf6DcwdHvnO8.png) - Our Mosaic support any size and any any image numbers! +* if wechat expired, please contact me update via github issue. group for general discussion, not only for yolov7. - **new**: - we merged another mosiac implementation from YOLOX, this version will do random pespective: +## 🀄️ Some Exp Visualizations - ![](https://z3.ax1x.com/2021/08/06/futTte.png) - ![](https://z3.ax1x.com/2021/08/06/futv0f.png) - ![](https://z3.ax1x.com/2021/08/07/fKEPvd.png) +| GridMask | Mosaic | +:-------------------------:|:-------------------------: +![](https://z3.ax1x.com/2021/06/27/RYeJkd.png) | ![](https://z3.ax1x.com/2021/07/06/RIX1iR.png) +![](https://z3.ax1x.com/2021/07/06/Roj5dg.png) | ![](https://z3.ax1x.com/2021/07/06/Roq97d.png) +![](https://z3.ax1x.com/2021/08/06/futTte.png) | ![](https://z3.ax1x.com/2021/08/06/futv0f.png) -## Some Phenomenon I don't Understand -1. I found darknet-based YOLOv4 needs a so lower nms threshold (0.1 for me). I don't know why. -2. I found resnet50-fpn based YOLOv4 can not get a better detection result than darknet. +## ©️ License +Code released under GPL license. Please pull request to this source repo before you make your changes public or commercial usage. All rights reserved by Lucas Jin. 
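Editor's note: the export section above says the simplified SparseInst ONNX file can be run with ONNX Runtime without any unsupported ops. Below is a minimal sanity-check sketch, assuming the `_sim.onnx` produced by `export.py` and the BGR, NHWC, float32 input layout used by `load_test_image` in the export script earlier in this patch; verify the actual input name and shape with `sess.get_inputs()` for your own export.

```
# Illustrative ONNX Runtime check for the exported SparseInst model
# (not part of this patch). File name, 640x640 size and BGR/NHWC/float32
# layout follow the export script above; adjust them to your export.
import cv2
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "weights/sparse_inst_r50_giam_aug_2b7d68_sim.onnx",
    providers=["CPUExecutionProvider"],
)

img = cv2.imread("images/COCO_val2014_000000002153.jpg")         # BGR, as in load_test_image
inp = cv2.resize(img, (640, 640)).astype(np.float32)[None, ...]  # 1 x H x W x 3

input_name = sess.get_inputs()[0].name  # expected to be "images" per get_model_infos
masks, scores, labels = sess.run(None, {input_name: inp})
print(masks.shape, scores.shape, labels.shape)
```

The three outputs are unpacked in the `masks`, `scores`, `labels` order declared by `get_model_infos` in the export script.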
diff --git a/requirements.txt b/requirements.txt old mode 100755 new mode 100644 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..884adf5 --- /dev/null +++ b/setup.py @@ -0,0 +1,47 @@ +import os +import subprocess +import time +from setuptools import find_packages, setup +import io +from os import path + + +this_directory = path.abspath(path.dirname(__file__)) +with io.open(path.join(this_directory, "readme.md"), encoding="utf-8") as f: + long_description = f.read() + + +version_file = "yolov7/version.py" + + +def get_version(): + with open(version_file, "r") as f: + exec(compile(f.read(), version_file, "exec")) + return locals()["__version__"] + + +if __name__ == "__main__": + setup( + name="yolov7", + version=get_version(), + description="YOLOv7 is a high-level training framework based on detectron2", + long_description="", + author="LucasJin", + author_email="jinfagang19@163.com", + keywords="computer vision, object detection", + url="https://github.com/jinfagang/yolov7", + packages=find_packages(exclude=("configs", "tools", "demo", "images")), + classifiers=[ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + ], + license="Apache License 2.0", + zip_safe=False, + ) diff --git a/tests.py b/tests.py old mode 100755 new mode 100644 index 4177f44..d4ec4d9 --- a/tests.py +++ b/tests.py @@ -1,4 +1,26 @@ -from tests.test_backbone import test_backbone +import torch +def batched_index_select(input, dim, index): + views = [1 if i != dim else -1 for i in range(len(input.shape))] + expanse = list(input.shape) + expanse[dim] = -1 + index = index.view(views).expand(expanse) + # making the first dim of output be B + return torch.cat(torch.chunk(torch.gather(input, dim, index), chunks=index.shape[0], dim=dim), dim=0) -test_backbone() \ No newline at end of file + +mask = torch.randn([3, 100, 224, 224]) +score = torch.randn(3 ,100) +_, keep = torch.topk(score, 20) + +mask2 = mask.view(-1, 224, 224) +keep2 = keep.view(-1, 20) + +a = mask2[keep2] + +score = score.view(-1) +b = score[keep2] +print(a.shape) +print(b.shape) +# print(keep) +# print(mask) diff --git a/export_torchscript.py b/tools/benchmark.py old mode 100755 new mode 100644 similarity index 56% rename from export_torchscript.py rename to tools/benchmark.py index 38635b9..ec4d343 --- a/export_torchscript.py +++ b/tools/benchmark.py @@ -5,6 +5,7 @@ import os import time import cv2 +from detectron2.structures.masks import BitMasks from numpy.core.fromnumeric import sort import tqdm import torch @@ -22,25 +23,11 @@ from yolov7.config import add_yolo_config - -from alfred.vis.image.mask import label2color_mask, vis_bitmasks +from alfred.vis.image.mask import label2color_mask, vis_bitmasks, vis_bitmasks_with_classes from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy -from alfred.dl.torch.common import device - - -""" -this script used as export torchscript only. - -Not all models support torchscript export. Once it exported, you can using torchscript for -deployment or TVM accelerate. 
-Command: - -python3 export_torchscript.py --config-file configs/coco/yolox_s.yaml --input ./images/COCO_val2014_000000002153.jpg --opts MODEL.WEIGHTS ./output/coco_yolox_s/model_final.pth - -""" - -torch.set_grad_enabled(False) +# constants +WINDOW_NAME = "COCO detections" class DefaultPredictor: @@ -69,14 +56,15 @@ def __call__(self, original_image): height, width = original_image.shape[:2] image = self.aug.get_transform( original_image).apply_image(original_image) - print('image after transform: ', image.shape) image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) inputs = {"image": image, "height": height, "width": width} tic = time.time() - predictions = self.model([inputs])[0] + # predictions, pure_t = self.model([inputs]) + predictions = self.model([inputs]) + predictions = predictions[0] c = time.time() - tic - print('cost: {}, fps: {}'.format(c, 1/c)) - return predictions + # print('cost: {}, fps: {}'.format(c, 1/c)) + return predictions, image.shape def setup_cfg(args): @@ -86,16 +74,18 @@ def setup_cfg(args): cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) - cfg.MODEL.YOLO.CONF_THRESHOLD = 0.3 - cfg.MODEL.YOLO.NMS_THRESHOLD = 0.6 + cfg.MODEL.YOLO.CONF_THRESHOLD = args.confidence_threshold + cfg.MODEL.YOLO.NMS_THRESHOLD = args.nms_threshold cfg.MODEL.YOLO.IGNORE_THRESHOLD = 0.1 - cfg.INPUT.MIN_SIZE_TEST = 672 # 90ms + # cfg.INPUT.MIN_SIZE_TEST = 672 # 90ms + # cfg.INPUT.MIN_SIZE_TEST = 2560 # 90ms + # cfg.INPUT.MAX_SIZE_TEST = 3060 # 90ms + cfg.INPUT.MAX_SIZE_TEST = 900 # 90ms # cfg.INPUT.MIN_SIZE_TEST = 512 # 70ms # cfg.INPUT.MIN_SIZE_TEST = 1080 # 40ms - # cfg.INPUT.MAX_SIZE_TEST = 640 # 40ms - # cfg.INPUT.MAX_SIZE_TEST = 768 # 70ms - cfg.INPUT.MAX_SIZE_TEST = 1080 # 70ms + # cfg.INPUT.MAX_SIZE_TEST = 512 # 40ms + # cfg.INPUT.MAX_SIZE_TEST = 1080 # 70ms cfg.freeze() return cfg @@ -115,7 +105,6 @@ def get_parser(): parser.add_argument( "--input", # nargs="+", - default='./images/COCO_val2014_000000001722.jpg', help="A list of space separated input images; " "or a single glob pattern such as 'directory/*.jpg'", ) @@ -126,17 +115,16 @@ def get_parser(): ) parser.add_argument( - "--confidence-threshold", + '-c', "--confidence-threshold", type=float, - default=0.65, + default=0.21, help="Minimum score for instance predictions to be shown", ) parser.add_argument( - "-v", - "--verbose", - default=False, - action='store_true', - help="verbose when onnx export", + '-n', "--nms-threshold", + type=float, + default=0.6, + help="Minimum score for instance predictions to be shown", ) parser.add_argument( "--opts", @@ -147,90 +135,56 @@ def get_parser(): return parser -def load_test_image(f, h, w): - a = cv2.imread(f) - a = cv2.resize(a, (w, h)) - a_t = torch.tensor(a.astype(np.float32)).unsqueeze(0) - return a_t, a - - -def load_test_image_detr(f, h, w): - """ - detr do not using - """ - a = cv2.imread(f) - a = cv2.resize(a, (w, h)) - a_t = torch.tensor(a.astype(np.float32)).permute(2, 0, 1).to(device) - return torch.stack([a_t,]), a - # return torch.stack([a_t, a_t]), a - -def detr_postprocess(out_boxes, ori_img): - """ - normalized xyxy output - """ - h, w, _ = ori_img.shape - out_boxes[..., 0] *= w - out_boxes[..., 1] *= h - out_boxes[..., 2] *= w - out_boxes[..., 3] *= h - return out_boxes - - -def vis_res_fast(res, img, colors): - res = res[0].cpu().numpy() - scores = res[:, -2] - clss = res[:, -1] - bboxes = res[:, :4] - - indices = scores > 0.6 - bboxes = bboxes[indices] - scores = scores[indices] - clss = clss[indices] - - img = 
visualize_det_cv2_part( - img, scores, clss, bboxes, force_color=colors, is_show=True) - # img = cv2.addWeighted(img, 0.9, m, 0.6, 0.9) - return img - - if __name__ == "__main__": mp.set_start_method("spawn", force=True) args = get_parser().parse_args() setup_logger(name="fvcore") logger = setup_logger() logger.info("Arguments: " + str(args)) - assert os.path.isfile( - args.input), 'onnx export only support send a image file.' cfg = setup_cfg(args) - colors = [[random.randint(0, 255) for _ in range(3)] - for _ in range(cfg.MODEL.YOLO.CLASSES)] metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) predictor = DefaultPredictor(cfg) - h = 768 - w = 960 - # h = 640 - # w = 640 - # inp, ori_img = load_test_image(args.input, h, w) - inp, ori_img = load_test_image_detr(args.input, h, w) - print('input shape: ', inp.shape) - # inp = inp.to(torch.device('cuda')) - - model = predictor.model - model = model.float() - model.onnx_export = True - - ts_f = os.path.join( - 'weights', os.path.basename(cfg.MODEL.WEIGHTS).split('.')[0] + '.torchscript.pt') - traced = torch.jit.trace(model, inp) - torch.jit.save(traced, ts_f) - logger.info('Model saved into: {}'.format(ts_f)) - - logger.info('test if torchscript export logic is right...') - model.onnx_vis = True - out = model(inp) - out = detr_postprocess(out, ori_img) - # detr postprocess - vis_res_fast(out, ori_img, colors=colors) + print(cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST) + colors = [[random.randint(0, 255) for _ in range(3)] + for _ in range(cfg.MODEL.YOLO.CLASSES)] + + if args.input: + if os.path.isdir(args.input): + print('Benchmark only support single image input.') + else: + t0 = time.time() + num_times = 200 + + img = cv2.imread(args.input) + a = img.shape + for i in range(num_times): + # print('ori img shape: ', img.shape) + res, a = predictor(img) + t1 = time.time() + print(f'Total time: {t1 -t0}\n' + f'Average time: {(t1-t0)/num_times}\n' + f'Input shape: {a}\n' + f'Original shape: {img.shape}\n') + + elif args.webcam: + print('Not supported.') + elif args.video_input: + video = cv2.VideoCapture(args.video_input) + width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames_per_second = video.get(cv2.CAP_PROP_FPS) + num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) + basename = os.path.basename(args.video_input) + + while(video.isOpened()): + ret, frame = video.read() + # frame = cv2.resize(frame, (640, 640)) + res = predictor(frame) + # res = vis_res_fast(res, frame, metadata, colors) + # cv2.imshow('frame', res) + cv2.imshow('frame', res) + if cv2.waitKey(1) & 0xFF == ord('q'): + break diff --git a/tools/cleandata.py b/tools/cleandata.py new file mode 100644 index 0000000..c4b20ef --- /dev/null +++ b/tools/cleandata.py @@ -0,0 +1,248 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+import argparse +import glob +import multiprocessing as mp +import os +import time +import shutil +import cv2 +from detectron2.structures.masks import BitMasks +from numpy.core.fromnumeric import sort +import tqdm +import torch +import time +import random +from detectron2.data.detection_utils import read_image +from detectron2.utils.logger import setup_logger + +import numpy as np +from detectron2.data.catalog import MetadataCatalog +from detectron2.config import get_cfg +from detectron2.modeling import build_model +import detectron2.data.transforms as T +from detectron2.checkpoint import DetectionCheckpointer + +from yolov7.config import add_yolo_config + + +from alfred.vis.image.mask import label2color_mask, vis_bitmasks, vis_bitmasks_with_classes +from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy + +# constants +WINDOW_NAME = "COCO detections" + + +""" + +this script is for clean data purpose. + +it will load model and glob all image files under a directory, +then will copy all images that not have any positive detections, +then you can judge apply remove by names at your given directory + +""" + + +class DefaultPredictor: + + def __init__(self, cfg): + self.cfg = cfg.clone() # cfg can be modified by model + self.model = build_model(self.cfg) + self.model.eval() + if len(cfg.DATASETS.TEST): + self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) + + checkpointer = DetectionCheckpointer(self.model) + checkpointer.load(cfg.MODEL.WEIGHTS) + + self.aug = T.ResizeShortestEdge( + [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST + ) + + self.input_format = cfg.INPUT.FORMAT + assert self.input_format in ["RGB", "BGR"], self.input_format + + def __call__(self, original_image): + with torch.no_grad(): + if self.input_format == "RGB": + original_image = original_image[:, :, ::-1] + height, width = original_image.shape[:2] + image = self.aug.get_transform( + original_image).apply_image(original_image) + print('image after transform: ', image.shape) + image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + inputs = {"image": image, "height": height, "width": width} + tic = time.time() + # predictions, pure_t = self.model([inputs]) + predictions = self.model([inputs]) + predictions = predictions[0] + c = time.time() - tic + print('cost: {}, fps: {}'.format(c, 1/c)) + return predictions + + +def setup_cfg(args): + # load config from file and command-line arguments + cfg = get_cfg() + add_yolo_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + + cfg.MODEL.YOLO.CONF_THRESHOLD = args.confidence_threshold + cfg.MODEL.YOLO.NMS_THRESHOLD = args.nms_threshold + cfg.MODEL.YOLO.IGNORE_THRESHOLD = 0.1 + + # cfg.INPUT.MIN_SIZE_TEST = 672 # 90ms + # cfg.INPUT.MIN_SIZE_TEST = 2560 # 90ms + # cfg.INPUT.MAX_SIZE_TEST = 3060 # 90ms + cfg.INPUT.MAX_SIZE_TEST = 900 # 90ms + # cfg.INPUT.MIN_SIZE_TEST = 512 # 70ms + # cfg.INPUT.MIN_SIZE_TEST = 1080 # 40ms + # cfg.INPUT.MAX_SIZE_TEST = 512 # 40ms + # cfg.INPUT.MAX_SIZE_TEST = 1080 # 70ms + cfg.freeze() + return cfg + + +def get_parser(): + parser = argparse.ArgumentParser( + description="Detectron2 demo for builtin configs") + parser.add_argument( + "--config-file", + default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument("--webcam", action="store_true", + help="Take inputs from webcam.") + parser.add_argument("--video-input", help="Path to video file.") + 
parser.add_argument( + "--input", + # nargs="+", + help="A list of space separated input images; " + "or a single glob pattern such as 'directory/*.jpg'", + ) + parser.add_argument( + "--output", + help="A file or directory to save output visualizations. " + "If not given, will show output in an OpenCV window.", + ) + + parser.add_argument( + '-c', "--confidence-threshold", + type=float, + default=0.45, + help="Minimum score for instance predictions to be shown", + ) + parser.add_argument( + '-n', "--nms-threshold", + type=float, + default=0.6, + help="Minimum score for instance predictions to be shown", + ) + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + return parser + + +def vis_res_fast(res, img, meta, colors): + ins = res['instances'] + bboxes = ins.pred_boxes.tensor.cpu().numpy() + scores = ins.scores.cpu().numpy() + clss = ins.pred_classes.cpu().numpy() + + if ins.has('pred_bit_masks'): + # img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # img = np.stack((img,)*3, axis=-1) + + bit_masks = ins.pred_bit_masks + if isinstance(bit_masks, BitMasks): + bit_masks = bit_masks.tensor.cpu().numpy() + # img = vis_bitmasks_with_classes(img, clss, bit_masks) + # img = vis_bitmasks_with_classes(img, clss, bit_masks, force_colors=colors, mask_border_color=(255, 255, 255), thickness=2) + img = vis_bitmasks_with_classes( + img, clss, bit_masks, force_colors=None, draw_contours=False) + # img = vis_bitmasks(img, bit_masks, thickness=2, draw_contours=False) + # img = vis_bitmasks(img, bit_masks, thickness=2, draw_contours=False, fill_mask=True) + # print('img shape: ', img.shape) + thickness = 1 if ins.has('pred_bit_masks') else 2 + font_scale = 0.3 if ins.has('pred_bit_masks') else 0.4 + + is_no_obj = False + if bboxes.shape[0] == 0: + print('found an img without positive samples.') + is_no_obj = True + img = visualize_det_cv2_part( + img, scores, clss, bboxes, force_color=colors, line_thickness=thickness, font_scale=font_scale) + # img = cv2.addWeighted(img, 0.9, m, 0.6, 0.9) + return img, is_no_obj + + +if __name__ == "__main__": + mp.set_start_method("spawn", force=True) + args = get_parser().parse_args() + setup_logger(name="fvcore") + logger = setup_logger() + logger.info("Arguments: " + str(args)) + + cfg = setup_cfg(args) + + metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) + predictor = DefaultPredictor(cfg) + + print(cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST) + colors = [[random.randint(0, 255) for _ in range(3)] + for _ in range(cfg.MODEL.YOLO.CLASSES)] + + if args.input: + if os.path.isdir(args.input): + p_d = os.path.dirname(args.input) + target_d = os.path.join(p_d, 'no_objects') + os.makedirs(target_d, exist_ok=True) + # glob all images recursive + imgs = glob.glob(os.path.join(args.input, '**/*.jpg'), recursive=True) + imgs = sorted(imgs) + # print(imgs) + for path in imgs: + if os.path.isfile(path): + # use PIL, to be consistent with evaluation + img = cv2.imread(path) + # print('ori img shape: ', img.shape) + res = predictor(img) + res, is_no_obj = vis_res_fast(res, img, metadata, colors) + # cv2.imshow('frame', res) + cv2.imshow('frame', res) + if is_no_obj: + shutil.copy(path, os.path.join(target_d, os.path.basename(path))) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + else: + img = cv2.imread(args.input) + res = predictor(img) + res = vis_res_fast(res, img, metadata, colors) + # cv2.imshow('frame', res) + cv2.imshow('frame', res) + 
cv2.waitKey(0) + elif args.webcam: + print('Not supported.') + elif args.video_input: + video = cv2.VideoCapture(args.video_input) + width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames_per_second = video.get(cv2.CAP_PROP_FPS) + num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) + basename = os.path.basename(args.video_input) + + while(video.isOpened()): + ret, frame = video.read() + # frame = cv2.resize(frame, (640, 640)) + res = predictor(frame) + res = vis_res_fast(res, frame, metadata, colors) + # cv2.imshow('frame', res) + cv2.imshow('frame', res) + if cv2.waitKey(1) & 0xFF == ord('q'): + break diff --git a/tools/compute_anchors.py b/tools/compute_anchors.py old mode 100755 new mode 100644 diff --git a/tools/convert_anchordetr_to_d2.py b/tools/convert_anchordetr_to_d2.py new file mode 100644 index 0000000..bf45076 --- /dev/null +++ b/tools/convert_anchordetr_to_d2.py @@ -0,0 +1,98 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Helper script to convert models trained with the main version of DETR to be used with the Detectron2 version. +""" +import json +import argparse + +import numpy as np +import torch + + +def parse_args(): + parser = argparse.ArgumentParser("D2 model converter") + + parser.add_argument("--source_model", default="", type=str, help="Path or url to the DETR model to convert") + parser.add_argument("--output_model", default="", type=str, help="Path where to save the converted model") + parser.add_argument("--variant", default="detr", type=str, help="detr or anchordetr") + parser.add_argument("--mask", action="store_true", help="mask or not") + return parser.parse_args() + + +def main(): + args = parse_args() + + # D2 expects contiguous classes, so we need to remap the 91 classes from DETR + # fmt: off + coco_idx = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, + 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 0,12,26,29,30,45,68,69,71,83] + # fmt: on + + coco_idx = np.array(coco_idx) + va = args.variant + + if args.source_model.startswith("https"): + checkpoint = torch.hub.load_state_dict_from_url(args.source_model, map_location="cpu", check_hash=True) + else: + checkpoint = torch.load(args.source_model, map_location="cpu") + model_to_convert = checkpoint["model"] + + model_converted = {} + for k in model_to_convert.keys(): + old_k = k + if "backbone" in k: + print(k) + k = k.replace("backbone.body.", "") + if "layer" not in k: + k = "stem." + k + for t in [1, 2, 3, 4]: + k = k.replace(f"layer{t}", f"res{t + 1}") + for t in [1, 2, 3]: + k = k.replace(f"bn{t}", f"conv{t}.norm") + k = k.replace("downsample.0", "shortcut") + k = k.replace("downsample.1", "shortcut.norm") + k = "backbone.backbone." + k + k = f"{va}." 
+ k + print(old_k, "->", k) + if "class_embed" in old_k: + v = model_to_convert[old_k].detach() + print(v.shape) + if v.shape[0] == 91: + shape_old = v.shape + # a = torch.zeros_like(v) + # a[:82,] = v[coco_idx] + # b = a[:82,] + a = v[coco_idx] + v_sum = torch.sum(a[-10:], dim=0).unsqueeze(0) + print(v_sum.shape) + b = torch.cat([a[:-10], v_sum], dim=0) + # a[] + model_converted[k] = b + print("Head conversion: changing shape from {} to {}".format(shape_old, model_converted[k].shape)) + continue + model_converted[k] = model_to_convert[old_k].detach() + + if args.mask: + # for mask, replace detr.backbone.0.backbone.stem.detr.conv1.weight -> + # detr.detr.backbone.0.backbone.res2.0.conv1.weight + print('sovling for mask...') + model_converted_new = {} + for k in model_converted.keys(): + old_k = k + if 'backbone' in k: + k = 'detr.' + k + k = k.replace('backbone.detr', 'backbone') + k = k.replace('stem.detr', 'stem') + print(old_k, "->", k) + model_converted_new[k] = model_converted[old_k].detach() + model_to_save = {"model": model_converted_new} + torch.save(model_to_save, args.output_model) + else: + model_to_save = {"model": model_converted} + torch.save(model_to_save, args.output_model) + + +if __name__ == "__main__": + main() diff --git a/tools/convert_detr_to_d2.py b/tools/convert_detr_to_d2.py old mode 100755 new mode 100644 index 6fa5ff4..1dbbb34 --- a/tools/convert_detr_to_d2.py +++ b/tools/convert_detr_to_d2.py @@ -14,6 +14,8 @@ def parse_args(): parser.add_argument("--source_model", default="", type=str, help="Path or url to the DETR model to convert") parser.add_argument("--output_model", default="", type=str, help="Path where to save the converted model") + parser.add_argument("--variant", default="detr", type=str, help="detr or anchordetr") + parser.add_argument("--mask", action="store_true", help="mask or not") return parser.parse_args() @@ -29,6 +31,7 @@ def main(): # fmt: on coco_idx = np.array(coco_idx) + va = args.variant if args.source_model.startswith("https"): checkpoint = torch.hub.load_state_dict_from_url(args.source_model, map_location="cpu", check_hash=True) @@ -50,7 +53,7 @@ def main(): k = k.replace("downsample.0", "shortcut") k = k.replace("downsample.1", "shortcut.norm") k = "backbone.0.backbone." + k - k = "detr." + k + k = f"{va}." + k print(old_k, "->", k) if "class_embed" in old_k: v = model_to_convert[old_k].detach() @@ -60,9 +63,25 @@ def main(): print("Head conversion: changing shape from {} to {}".format(shape_old, model_converted[k].shape)) continue model_converted[k] = model_to_convert[old_k].detach() - - model_to_save = {"model": model_converted} - torch.save(model_to_save, args.output_model) + + if args.mask: + # for mask, replace detr.backbone.0.backbone.stem.detr.conv1.weight -> + # detr.detr.backbone.0.backbone.res2.0.conv1.weight + print('sovling for mask...') + model_converted_new = {} + for k in model_converted.keys(): + old_k = k + if 'backbone' in k: + k = 'detr.' 
+ k + k = k.replace('backbone.detr', 'backbone') + k = k.replace('stem.detr', 'stem') + print(old_k, "->", k) + model_converted_new[k] = model_converted[old_k].detach() + model_to_save = {"model": model_converted_new} + torch.save(model_to_save, args.output_model) + else: + model_to_save = {"model": model_converted} + torch.save(model_to_save, args.output_model) if __name__ == "__main__": diff --git a/tools/convert_smcadetr_to_d2.py b/tools/convert_smcadetr_to_d2.py new file mode 100644 index 0000000..1dbbb34 --- /dev/null +++ b/tools/convert_smcadetr_to_d2.py @@ -0,0 +1,88 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Helper script to convert models trained with the main version of DETR to be used with the Detectron2 version. +""" +import json +import argparse + +import numpy as np +import torch + + +def parse_args(): + parser = argparse.ArgumentParser("D2 model converter") + + parser.add_argument("--source_model", default="", type=str, help="Path or url to the DETR model to convert") + parser.add_argument("--output_model", default="", type=str, help="Path where to save the converted model") + parser.add_argument("--variant", default="detr", type=str, help="detr or anchordetr") + parser.add_argument("--mask", action="store_true", help="mask or not") + return parser.parse_args() + + +def main(): + args = parse_args() + + # D2 expects contiguous classes, so we need to remap the 92 classes from DETR + # fmt: off + coco_idx = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, + 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91] + # fmt: on + + coco_idx = np.array(coco_idx) + va = args.variant + + if args.source_model.startswith("https"): + checkpoint = torch.hub.load_state_dict_from_url(args.source_model, map_location="cpu", check_hash=True) + else: + checkpoint = torch.load(args.source_model, map_location="cpu") + model_to_convert = checkpoint["model"] + + model_converted = {} + for k in model_to_convert.keys(): + old_k = k + if "backbone" in k: + k = k.replace("backbone.0.body.", "") + if "layer" not in k: + k = "stem." + k + for t in [1, 2, 3, 4]: + k = k.replace(f"layer{t}", f"res{t + 1}") + for t in [1, 2, 3]: + k = k.replace(f"bn{t}", f"conv{t}.norm") + k = k.replace("downsample.0", "shortcut") + k = k.replace("downsample.1", "shortcut.norm") + k = "backbone.0.backbone." + k + k = f"{va}." + k + print(old_k, "->", k) + if "class_embed" in old_k: + v = model_to_convert[old_k].detach() + if v.shape[0] == 92: + shape_old = v.shape + model_converted[k] = v[coco_idx] + print("Head conversion: changing shape from {} to {}".format(shape_old, model_converted[k].shape)) + continue + model_converted[k] = model_to_convert[old_k].detach() + + if args.mask: + # for mask, replace detr.backbone.0.backbone.stem.detr.conv1.weight -> + # detr.detr.backbone.0.backbone.res2.0.conv1.weight + print('sovling for mask...') + model_converted_new = {} + for k in model_converted.keys(): + old_k = k + if 'backbone' in k: + k = 'detr.' 
+ k + k = k.replace('backbone.detr', 'backbone') + k = k.replace('stem.detr', 'stem') + print(old_k, "->", k) + model_converted_new[k] = model_converted[old_k].detach() + model_to_save = {"model": model_converted_new} + torch.save(model_to_save, args.output_model) + else: + model_to_save = {"model": model_converted} + torch.save(model_to_save, args.output_model) + + +if __name__ == "__main__": + main() diff --git a/tools/demo_onnx_detr.py b/tools/demo_onnx_detr.py old mode 100755 new mode 100644 diff --git a/tools/demo_trt_detr.py b/tools/demo_trt_detr.py old mode 100755 new mode 100644 index 36fea42..0fa2a49 --- a/tools/demo_trt_detr.py +++ b/tools/demo_trt_detr.py @@ -46,7 +46,8 @@ def preprocess_np_no_normalize(img_path): print(img_path) print(im.shape) # img = transform(im).unsqueeze(0) - a = cv2.resize(im, (960, 768)) + # a = cv2.resize(im, (960, 768)) + a = cv2.resize(im, (1960, 1080)) a = a.astype(np.float32) # a -= means # a /= stds @@ -188,6 +189,7 @@ def main(onnx_model_file, image_dir, fp16=False, int8=False, batch_size=1, dynam os.makedirs(save_dir) # plot_box(img_raw, scores, boxs, prob_threshold=0.7, # save_fig=os.path.join(save_dir, test_image)) + print(res) res = detr_postprocess(res, img_raw) vis_res_fast(res, img_raw) diff --git a/tools/eval_coco.py b/tools/eval_coco.py old mode 100755 new mode 100644 diff --git a/tools/lazyconfig_train_net.py b/tools/lazyconfig_train_net.py old mode 100755 new mode 100644 diff --git a/tools/quantize_d2.py b/tools/quantize_d2.py new file mode 100644 index 0000000..5809ecd --- /dev/null +++ b/tools/quantize_d2.py @@ -0,0 +1,376 @@ +""" +Using Atom to quantize d2 models + +such as YOLOX + +this is WIP, not full work now. +""" +# Copyright (c) Facebook, Inc. and its affiliates. +import argparse +import glob +import multiprocessing as mp +import os +import time +import cv2 +from numpy.core.fromnumeric import sort +import tqdm +import torch +import time +import random +from detectron2.data.detection_utils import read_image +from detectron2.utils.logger import setup_logger + +import numpy as np +from detectron2.data.catalog import MetadataCatalog +from detectron2.config import get_cfg +from detectron2.modeling import build_model +import detectron2.data.transforms as T +from detectron2.checkpoint import DetectionCheckpointer +from yolov7.config import add_yolo_config +from detectron2.data import ( + MetadataCatalog, + build_detection_train_loader, + DatasetCatalog, +) +from detectron2.data import build_detection_test_loader + +from alfred.vis.image.mask import label2color_mask, vis_bitmasks +from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy +from alfred.dl.torch.common import device +from detectron2.data.dataset_mapper import DatasetMapper +from yolov7.data.dataset_mapper import MyDatasetMapper + +from atomquant.atom.prepare_by_platform import prepare_by_platform, BackendType +from atomquant.atom.convert_deploy import convert_deploy +from torchvision import transforms +import torchvision +import torch +import yaml +from easydict import EasyDict + +backend_dict = { + "Academic": BackendType.Academic, + "Tensorrt": BackendType.Tensorrt, + "SNPE": BackendType.SNPE, + "PPLW8A16": BackendType.PPLW8A16, + "NNIE": BackendType.NNIE, + "Vitis": BackendType.Vitis, + "ONNX_QNN": BackendType.ONNX_QNN, + "PPLCUDA": BackendType.PPLCUDA, +} + + +""" +WIP. 
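Looking back at the three convert_*_to_d2.py helpers above: they all apply the same torchvision-to-detectron2 backbone key renaming before prefixing the meta-architecture name. A condensed sketch of that rule in the detr/smcadetr form (anchordetr strips "backbone.body." and prefixes "backbone.backbone." instead); the example key is illustrative:

def rename_backbone_key(key, prefix="detr"):
    """Map a torchvision ResNet state-dict key to its detectron2 name,
    following the replacements used in the converter scripts above."""
    key = key.replace("backbone.0.body.", "")
    if "layer" not in key:
        key = "stem." + key
    for t in [1, 2, 3, 4]:
        key = key.replace(f"layer{t}", f"res{t + 1}")
    for t in [1, 2, 3]:
        key = key.replace(f"bn{t}", f"conv{t}.norm")
    key = key.replace("downsample.0", "shortcut")
    key = key.replace("downsample.1", "shortcut.norm")
    return f"{prefix}.backbone.0.backbone." + key


# e.g. "backbone.0.body.layer1.0.bn1.weight" -> "detr.backbone.0.backbone.res2.0.conv1.norm.weight"
print(rename_backbone_key("backbone.0.body.layer1.0.bn1.weight"))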
+ +""" + + +def parse_config(config_file): + with open(config_file) as f: + config = yaml.load(f, Loader=yaml.FullLoader) + cur_config = config + cur_path = config_file + while "root" in cur_config: + root_path = os.path.dirname(cur_path) + cur_path = os.path.join(root_path, cur_config["root"]) + with open(cur_path) as r: + root_config = yaml.load(r, Loader=yaml.FullLoader) + for k, v in root_config.items(): + if k not in config: + config[k] = v + cur_config = root_config + # config = yaml.safe_load(f) + config = EasyDict(config) + return config + + +torch.set_grad_enabled(False) + + +class DefaultPredictor: + def __init__(self, cfg): + self.cfg = cfg.clone() # cfg can be modified by model + self.model = build_model(self.cfg) + self.model.eval() + if len(cfg.DATASETS.TEST): + self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) + + checkpointer = DetectionCheckpointer(self.model) + checkpointer.load(cfg.MODEL.WEIGHTS) + + self.aug = T.ResizeShortestEdge( + [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST + ) + + self.input_format = cfg.INPUT.FORMAT + assert self.input_format in ["RGB", "BGR"], self.input_format + + def __call__(self, original_image): + with torch.no_grad(): + if self.input_format == "RGB": + original_image = original_image[:, :, ::-1] + height, width = original_image.shape[:2] + image = self.aug.get_transform(original_image).apply_image(original_image) + print("image after transform: ", image.shape) + # image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + # do not do transpose here + image = torch.as_tensor(image.astype("float32")) + inputs = {"image": image, "height": height, "width": width} + predictions = self.model([inputs])[0] + return predictions + + +def setup_cfg(args): + # load config from file and command-line arguments + cfg = get_cfg() + add_yolo_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + + cfg.MODEL.YOLO.CONF_THRESHOLD = 0.3 + cfg.MODEL.YOLO.NMS_THRESHOLD = 0.6 + cfg.MODEL.YOLO.IGNORE_THRESHOLD = 0.1 + + cfg.INPUT.MIN_SIZE_TEST = 672 # 90ms + # cfg.INPUT.MIN_SIZE_TEST = 512 # 70ms + # cfg.INPUT.MIN_SIZE_TEST = 1080 # 40ms + # cfg.INPUT.MAX_SIZE_TEST = 640 # 40ms + # cfg.INPUT.MAX_SIZE_TEST = 768 # 70ms + cfg.INPUT.MAX_SIZE_TEST = 1080 # 70ms + cfg.freeze() + return cfg + + +def get_parser(): + parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs") + parser.add_argument( + "--config-file", + default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument( + "-qc", + default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", + metavar="FILE", + help="quantize config file", + ) + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + return parser + + +def load_test_image(f, h, w, bs=1): + a = cv2.imread(f) + a = cv2.resize(a, (w, h)) + a_t = torch.tensor(a.astype(np.float32)).to(device).unsqueeze(0).repeat(bs, 1, 1, 1) + return a_t, a + + +def load_test_image_detr(f, h, w): + """ + detr do not using + """ + a = cv2.imread(f) + a = cv2.resize(a, (w, h)) + a_t = torch.tensor(a.astype(np.float32)).permute(2, 0, 1).to(device) + return ( + torch.stack( + [ + a_t, + ] + ), + a, + ) + + +def get_model_infos(config_file): + if "sparse_inst" in config_file: + # output_names = ["masks", "scores", "labels"] + output_names = ["masks", "scores"] + 
input_names = ["images"] + dynamic_axes = {"images": {0: "batch"}} + return input_names, output_names, dynamic_axes + elif "detr" in config_file: + return ["boxes", "scores", "labels"] + else: + return ["outs"] + + +def load_calibrate_data(train_loader, cali_batchsize): + cali_data = [] + for i, batch in enumerate(train_loader): + imgs = batch["images"] + print(imgs) + cali_data.append(batch[0]) + if i + 1 == cali_batchsize: + break + return cali_data + + +def get_quantize_model(model, config): + backend_type = ( + BackendType.Academic + if not hasattr(config.quantize, "backend") + else backend_dict[config.quantize.backend] + ) + extra_prepare_dict = ( + {} if not hasattr(config, "extra_prepare_dict") else config.extra_prepare_dict + ) + return prepare_by_platform(model, backend_type, extra_prepare_dict) + + +def deploy(model, config): + backend_type = ( + BackendType.Academic + if not hasattr(config.quantize, "backend") + else backend_dict[config.quantize.backend] + ) + output_path = ( + "./" + if not hasattr(config.quantize, "deploy") + else config.quantize.deploy.output_path + ) + model_name = config.quantize.deploy.model_name + deploy_to_qlinear = ( + False + if not hasattr(config.quantize.deploy, "deploy_to_qlinear") + else config.quantize.deploy.deploy_to_qlinear + ) + + convert_deploy( + model, + backend_type, + {"input": [1, 3, 224, 224]}, + output_path=output_path, + model_name=model_name, + deploy_to_qlinear=deploy_to_qlinear, + ) + + +def evaluate_model(model, test_loader, criterion=None): + t0 = time.time() + model.eval() + model.to(device) + running_loss = 0 + running_corrects = 0 + for inputs, labels in test_loader: + + inputs = inputs.to(device) + labels = labels.to(device) + outputs = model(inputs) + _, preds = torch.max(outputs, 1) + + if criterion is not None: + loss = criterion(outputs, labels).item() + else: + loss = 0 + + # statistics + running_loss += loss * inputs.size(0) + running_corrects += torch.sum(preds == labels.data) + + eval_loss = running_loss / len(test_loader.dataset) + eval_accuracy = running_corrects / len(test_loader.dataset) + t1 = time.time() + print(f"eval loss: {eval_loss}, eval acc: {eval_accuracy}, cost: {t1 - t0}") + return eval_loss, eval_accuracy + + +def prepare_dataloader(cfg): + test_loader = build_detection_test_loader( + cfg, "coco_2017_val", mapper=MyDatasetMapper(cfg, True) + ) + return test_loader + + +if __name__ == "__main__": + mp.set_start_method("spawn", force=True) + args = get_parser().parse_args() + setup_logger(name="fvcore") + logger = setup_logger() + logger.info("Arguments: " + str(args)) + cfg = setup_cfg(args) + metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) + predictor = DefaultPredictor(cfg) + + model = predictor.model + # must in onnx export for PTQ, since we need export onnx later. 
+ model.onnx_export = True + + onnx_f = os.path.join( + "weights", os.path.basename(cfg.MODEL.WEIGHTS).split(".")[0] + ".onnx" + ) + test_loader = prepare_dataloader(cfg) + + config_f = args.qc + config = parse_config(config_f) + print(config) + model.to(device) + model.eval() + + if hasattr(config, "quantize"): + model = get_quantize_model(model, config) + print("now model in quantized mode.") + + model.to(device) + evaluate_model(model, test_loader) + + # evaluate + if not hasattr(config, "quantize"): + evaluate_model(model, test_loader) + elif config.quantize.quantize_type == "advanced_ptq": + print("begin calibration now!") + cali_data = load_calibrate_data( + test_loader, cali_batchsize=config.quantize.cali_batchsize + ) + from mqbench.utils.state import ( + enable_quantization, + enable_calibration_woquantization, + ) + + # do activation and weight calibration seperately for quick MSE per-channel for weight one + model.eval() + enable_calibration_woquantization(model, quantizer_type="act_fake_quant") + for batch in cali_data: + model(batch.cuda()) + enable_calibration_woquantization(model, quantizer_type="weight_fake_quant") + model(cali_data[0].cuda()) + print("begin advanced PTQ now!") + if hasattr(config.quantize, "reconstruction"): + model = ptq_reconstruction(model, cali_data, config.quantize.reconstruction) + enable_quantization(model) + evaluate_model(model, test_loader) + if hasattr(config.quantize, "deploy"): + deploy(model, config) + elif config.quantize.quantize_type == "naive_ptq": + print("begin calibration now!") + cali_data = load_calibrate_data( + test_loader, cali_batchsize=config.quantize.cali_batchsize + ) + from atomquant.atom.utils.state import ( + enable_quantization, + enable_calibration_woquantization, + ) + + # do activation and weight calibration seperately for quick MSE per-channel for weight one + model.eval() + enable_calibration_woquantization(model, quantizer_type="act_fake_quant") + for batch in cali_data: + model(batch.to(device)) + enable_calibration_woquantization(model, quantizer_type="weight_fake_quant") + model(cali_data[0].to(device)) + print("begin quantization now!") + enable_quantization(model) + # print(model) + evaluate_model(model, test_loader) + if hasattr(config.quantize, "deploy"): + deploy(model, config) + else: + print("The quantize_type must in 'naive_ptq' or 'advanced_ptq',") + print("and 'advanced_ptq' need reconstruction configration.") diff --git a/tools/train_detr.py b/tools/train_detr.py old mode 100755 new mode 100644 diff --git a/train_cocomini.py b/train_cocomini.py deleted file mode 100755 index e080a3c..0000000 --- a/train_cocomini.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. - -""" - -Training script using custom coco format dataset - -what you need to do is simply change the img_dir and annotation path here -Also define your own categories. 
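Stepping back to tools/quantize_d2.py above: the naive_ptq branch calibrates activation observers over a few batches, calibrates weight observers on a single batch, then switches fake quantization on and re-evaluates. A condensed sketch of that order, reusing the helpers the script already imports (the device argument is illustrative):

import torch
from atomquant.atom.utils.state import (
    enable_calibration_woquantization,
    enable_quantization,
)


def naive_ptq(model, cali_data, device="cuda"):
    """Calibrate activation observers, then weight observers, then turn on fake quant."""
    model.eval()
    with torch.no_grad():
        enable_calibration_woquantization(model, quantizer_type="act_fake_quant")
        for batch in cali_data:
            model(batch.to(device))
        enable_calibration_woquantization(model, quantizer_type="weight_fake_quant")
        model(cali_data[0].to(device))  # a single batch for the per-channel weight observers
    enable_quantization(model)
    return model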
- -""" - -import os -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch -from detectron2.evaluation import COCOEvaluator -from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetCatalog -from detectron2.data.datasets.coco import load_coco_json, register_coco_instances -from detectron2.data.dataset_mapper import DatasetMapper -from detectron2.modeling import build_model -from detectron2.utils import comm - -from yolov7.config import add_yolo_config -from yolov7.data.dataset_mapper import MyDatasetMapper2, MyDatasetMapper -from yolov7.utils.allreduce_norm import all_reduce_norm - -# print(MetadataCatalog.get('coco_2017_val_panoptic_separated')) - -# here is your dataset config -CLASS_NAMES = MetadataCatalog.get('coco_2017_train').thing_classes -DATASET_ROOT = './datasets/coco' -ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') -TRAIN_PATH = os.path.join(DATASET_ROOT, 'train2017') -VAL_PATH = os.path.join(DATASET_ROOT, 'val2014') -TRAIN_JSON = os.path.join(ANN_ROOT, 'instances_minitrain2017.json') -VAL_JSON = os.path.join(ANN_ROOT, 'instances_minival2014.json') - -register_coco_instances("coco_2017_train_mini", {}, TRAIN_JSON, TRAIN_PATH) -register_coco_instances("coco_2014_val_mini", {}, VAL_JSON, VAL_PATH) - - -class Trainer(DefaultTrainer): - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - return COCOEvaluator(dataset_name, output_dir=output_folder) - - @classmethod - def build_train_loader(cls, cfg): - if cfg.MODEL.MASK_ON: - return build_detection_train_loader(cfg, mapper=MyDatasetMapper(cfg, True)) - else: - # open mosaic aug - return build_detection_train_loader(cfg, mapper=MyDatasetMapper2(cfg, True)) - # test our own dataset mapper to add more augmentations - # return build_detection_train_loader(cfg, mapper=MyDatasetMapper2(cfg, True)) - - @classmethod - def build_model(cls, cfg): - model = build_model(cfg) - # logger = logging.getLogger(__name__) - # logger.info("Model:\n{}".format(model)) - return model - - def run_step(self): - self._trainer.iter = self.iter - self._trainer.run_step() - if comm.get_world_size() == 1: - self.model.update_iter(self.iter) - else: - self.model.module.update_iter(self.iter) - - # if comm.is_main_process(): - # # when eval period, apply all_reduce_norm as in https://github.com/Megvii-BaseDetection/YOLOX/issues/547#issuecomment-903220346 - # interval = self.cfg.SOLVER.CHECKPOINT_PERIOD if self.cfg.TEST.EVAL_PERIOD == 0 else self.cfg.TEST.EVAL_PERIOD - # if self.iter % interval == 0: - # all_reduce_norm(self.model) - # self.checkpointer.save('latest') - - -def setup(args): - """ - Create configs and perform basic setups. 
- """ - cfg = get_cfg() - add_yolo_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.merge_from_list(args.opts) - cfg.freeze() - default_setup(cfg, args) - return cfg - - -def main(args): - cfg = setup(args) - - if args.eval_only: - model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( - cfg.MODEL.WEIGHTS, resume=args.resume - ) - res = Trainer.test(cfg, model) - return res - - trainer = Trainer(cfg) - trainer.resume_or_load(resume=args.resume) - # print('trainer.start: ', trainer.start_iter) - # trainer.model.iter = trainer.start_iter - # print('trainer.start: ', trainer.model.iter) - return trainer.train() - - -if __name__ == "__main__": - args = default_argument_parser().parse_args() - print("Command Line Args:", args) - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - args=(args,), - ) diff --git a/train_custom.py b/train_custom.py deleted file mode 100755 index a19b77f..0000000 --- a/train_custom.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. - -""" - -Training script using custom coco format dataset - -what you need to do is simply change the img_dir and annotation path here -Also define your own categories. - -""" - -import os -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch -from detectron2.evaluation import COCOEvaluator -from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetCatalog -from detectron2.data.datasets.coco import load_coco_json -from detectron2.data.dataset_mapper import DatasetMapper - -from yolov7.config import add_yolo_config -from yolov7.data.dataset_mapper import MyDatasetMapper - - -# here is your dataset config -CLASS_NAMES = ["face", 'face_mask'] -DATASET_ROOT = './datasets/facemask' -ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') -TRAIN_PATH = os.path.join(DATASET_ROOT, 'train') -VAL_PATH = os.path.join(DATASET_ROOT, 'val') -TRAIN_JSON = os.path.join(ANN_ROOT, 'instances_train2017.json') -VAL_JSON = os.path.join(ANN_ROOT, 'instances_val2017.json') -PREDEFINED_SPLITS_DATASET = { - "facemask_train": (TRAIN_PATH, TRAIN_JSON), - "facemask_val": (VAL_PATH, VAL_JSON), -} - - -def plain_register_dataset(): - for k, v in PREDEFINED_SPLITS_DATASET.items(): - DatasetCatalog.register( - k, lambda: load_coco_json(v[1], v[0])) - MetadataCatalog.get(k).set(thing_classes=CLASS_NAMES, - evaluator_type='coco', - json_file=v[1], - image_root=v[0]) - - -plain_register_dataset() - - -class Trainer(DefaultTrainer): - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - return COCOEvaluator(dataset_name, output_dir=output_folder) - - @classmethod - def build_train_loader(cls, cfg): - # return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) - # test our own dataset mapper to add more augmentations - return build_detection_train_loader(cfg, mapper=MyDatasetMapper(cfg, True)) - - -def setup(args): - """ - Create configs and perform basic setups. 
- """ - cfg = get_cfg() - add_yolo_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.merge_from_list(args.opts) - cfg.freeze() - default_setup(cfg, args) - return cfg - - -def main(args): - cfg = setup(args) - - if args.eval_only: - model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( - cfg.MODEL.WEIGHTS, resume=args.resume - ) - res = Trainer.test(cfg, model) - return res - - trainer = Trainer(cfg) - trainer.resume_or_load(resume=args.resume) - return trainer.train() - - -if __name__ == "__main__": - args = default_argument_parser().parse_args() - print("Command Line Args:", args) - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - args=(args,), - ) diff --git a/train_custom_datasets.py b/train_custom_datasets.py new file mode 100755 index 0000000..6220cbf --- /dev/null +++ b/train_custom_datasets.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. + +""" + +Training script using custom coco format dataset + +what you need to do is simply change the img_dir and annotation path here +Also define your own categories. + +""" + +import os +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.engine import ( + default_argument_parser, + launch, +) +from detectron2.data.datasets.coco import load_coco_json, register_coco_instances +from train_det import Trainer, setup + + +def register_custom_datasets(): + # facemask dataset + DATASET_ROOT = "./datasets/facemask" + ANN_ROOT = os.path.join(DATASET_ROOT, "annotations") + TRAIN_PATH = os.path.join(DATASET_ROOT, "train") + VAL_PATH = os.path.join(DATASET_ROOT, "val") + TRAIN_JSON = os.path.join(ANN_ROOT, "instances_train2017.json") + VAL_JSON = os.path.join(ANN_ROOT, "instances_val2017.json") + register_coco_instances("facemask_train", {}, TRAIN_JSON, TRAIN_PATH) + register_coco_instances("facemask_val", {}, VAL_JSON, VAL_PATH) + + # tl dataset + DATASET_ROOT = "./datasets/tl" + ANN_ROOT = os.path.join(DATASET_ROOT, "annotations") + TRAIN_PATH = os.path.join(DATASET_ROOT, "JPEGImages") + VAL_PATH = os.path.join(DATASET_ROOT, "JPEGImages") + TRAIN_JSON = os.path.join(ANN_ROOT, "annotations_coco_tls_train.json") + VAL_JSON = os.path.join(ANN_ROOT, "annotations_coco_tls_val_val.json") + register_coco_instances("tl_train", {}, TRAIN_JSON, TRAIN_PATH) + register_coco_instances("tl_val", {}, VAL_JSON, VAL_PATH) + + # visdrone dataset + DATASET_ROOT = "./datasets/visdrone" + ANN_ROOT = os.path.join(DATASET_ROOT, "visdrone_coco_anno") + TRAIN_PATH = os.path.join(DATASET_ROOT, "VisDrone2019-DET-train/images") + VAL_PATH = os.path.join(DATASET_ROOT, "VisDrone2019-DET-val/images") + TRAIN_JSON = os.path.join(ANN_ROOT, "VisDrone2019-DET_train_coco.json") + VAL_JSON = os.path.join(ANN_ROOT, "VisDrone2019-DET_val_coco.json") + register_coco_instances("visdrone_train", {}, TRAIN_JSON, TRAIN_PATH) + register_coco_instances("visdrone_val", {}, VAL_JSON, VAL_PATH) + + # wearmask dataset + DATASET_ROOT = "./datasets/wearmask" + ANN_ROOT = os.path.join(DATASET_ROOT, "annotations") + TRAIN_PATH = os.path.join(DATASET_ROOT, "images/train2017") + VAL_PATH = os.path.join(DATASET_ROOT, "images/val2017") + TRAIN_JSON = os.path.join(ANN_ROOT, "train.json") + VAL_JSON = os.path.join(ANN_ROOT, "val.json") + register_coco_instances("mask_train", {}, TRAIN_JSON, TRAIN_PATH) + register_coco_instances("mask_val", {}, VAL_JSON, VAL_PATH) + + # VOC dataset in coco format + 
DATASET_ROOT = "./datasets/voc" + ANN_ROOT = DATASET_ROOT + TRAIN_PATH = os.path.join(DATASET_ROOT, "JPEGImages") + VAL_PATH = os.path.join(DATASET_ROOT, "JPEGImages") + TRAIN_JSON = os.path.join(ANN_ROOT, "annotations_coco_train_2012.json") + VAL_JSON = os.path.join(ANN_ROOT, "annotations_coco_val_2012.json") + + register_coco_instances("voc_train", {}, TRAIN_JSON, TRAIN_PATH) + register_coco_instances("voc_val", {}, VAL_JSON, VAL_PATH) + + # ADD YOUR DATASET CONFIG HERE + # dataset names registered must be unique, different than any of above + + +register_custom_datasets() + + +def main(args): + cfg = setup(args) + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + res = Trainer.test(cfg, model) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/train_voc.py b/train_det.py similarity index 56% rename from train_voc.py rename to train_det.py index 9f09c65..fa2b00d 100755 --- a/train_voc.py +++ b/train_det.py @@ -1,95 +1,75 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. - -""" - -Training script using custom coco format dataset - -what you need to do is simply change the img_dir and annotation path here -Also define your own categories. - -""" - -from math import log -import os -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch -from detectron2.evaluation import COCOEvaluator -from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetCatalog -from detectron2.data.datasets.coco import load_coco_json, register_coco_instances -from detectron2.data.dataset_mapper import DatasetMapper - -from yolov7.config import add_yolo_config -from yolov7.data.dataset_mapper import MyDatasetMapper, MyDatasetMapper2 -from loguru import logger - - -# here is your dataset config - -DATASET_ROOT = './datasets/voc' -ANN_ROOT = DATASET_ROOT -TRAIN_PATH = os.path.join(DATASET_ROOT, 'JPEGImages') -VAL_PATH = os.path.join(DATASET_ROOT, 'JPEGImages') -TRAIN_JSON = os.path.join(ANN_ROOT, 'annotations_coco_train_2012.json') -VAL_JSON = os.path.join(ANN_ROOT, 'annotations_coco_val_2012.json') - -register_coco_instances("voc_train", {}, TRAIN_JSON, TRAIN_PATH) -register_coco_instances("voc_val", {}, VAL_JSON, VAL_PATH) - - -class Trainer(DefaultTrainer): - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - return COCOEvaluator(dataset_name, output_dir=output_folder) - - @classmethod - def build_train_loader(cls, cfg): - # return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) - # test our own dataset mapper to add more augmentations - return build_detection_train_loader(cfg, mapper=MyDatasetMapper2(cfg, True)) - - -def setup(args): - """ - Create configs and perform basic setups. 
- """ - cfg = get_cfg() - add_yolo_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.merge_from_list(args.opts) - cfg.freeze() - default_setup(cfg, args) - return cfg - - -@logger.catch -def main(args): - cfg = setup(args) - - if args.eval_only: - model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( - cfg.MODEL.WEIGHTS, resume=args.resume - ) - res = Trainer.test(cfg, model) - return res - - trainer = Trainer(cfg) - trainer.resume_or_load(resume=args.resume) - return trainer.train() - - -if __name__ == "__main__": - args = default_argument_parser().parse_args() - print("Command Line Args:", args) - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - args=(args,), - ) +""" +train detection entrance + +Copyright @2022 YOLOv7 authors + +""" +import os +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.engine import DefaultTrainer, default_argument_parser, launch +from detectron2.evaluation import COCOEvaluator +from detectron2.data import MetadataCatalog, build_detection_train_loader +from detectron2.modeling import build_model +from detectron2.utils import comm +from yolov7.data.dataset_mapper import MyDatasetMapper, MyDatasetMapper2 +from yolov7.config import add_yolo_config +from yolov7.utils.d2overrides import default_setup + + +class Trainer(DefaultTrainer): + + custom_mapper = None + + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + return COCOEvaluator(dataset_name, output_dir=output_folder) + + @classmethod + def build_train_loader(cls, cfg): + cls.custom_mapper = MyDatasetMapper2(cfg, True) + return build_detection_train_loader(cfg, mapper=cls.custom_mapper) + + @classmethod + def build_model(cls, cfg): + model = build_model(cfg) + return model + + +def setup(args): + cfg = get_cfg() + add_yolo_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + return cfg + + +def main(args): + cfg = setup(args) + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + res = Trainer.test(cfg, model) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/train_inseg.py b/train_inseg.py new file mode 100755 index 0000000..3943975 --- /dev/null +++ b/train_inseg.py @@ -0,0 +1,77 @@ +import os +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.engine import ( + DefaultTrainer, + default_argument_parser, + default_setup, + launch +) +from detectron2.data import build_detection_train_loader +from detectron2.modeling import build_model + +from yolov7.config import add_yolo_config +from yolov7.data.dataset_mapper import MyDatasetMapper, MyDatasetMapper2 +from yolov7.evaluation.coco_evaluation import COCOMaskEvaluator + +""" +Script used for training instance segmentation, i.e. SparseInst. 
+""" + +class Trainer(DefaultTrainer): + + custom_mapper = None + + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + return COCOMaskEvaluator(dataset_name, output_dir=output_folder) + + @classmethod + def build_train_loader(cls, cfg): + cls.custom_mapper = MyDatasetMapper(cfg, True) + return build_detection_train_loader(cfg, mapper=cls.custom_mapper) + + @classmethod + def build_model(cls, cfg): + model = build_model(cfg) + return model + + +def setup(args): + cfg = get_cfg() + add_yolo_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + return cfg + + +def main(args): + cfg = setup(args) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + res = Trainer.test(cfg, model) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/train_net.py b/train_net.py deleted file mode 100755 index 1be7170..0000000 --- a/train_net.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. - -""" -TridentNet Training Script. - -This script is a simplified version of the training script in detectron2/tools. -""" - -import os - -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch -from detectron2.evaluation import COCOEvaluator -from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetCatalog -from detectron2.data.dataset_mapper import DatasetMapper -from detectron2.modeling import build_model -from detectron2.utils import comm -import logging -from detectron2.solver import build_lr_scheduler, LRMultiplier, WarmupParamScheduler -from fvcore.common.param_scheduler import CosineParamScheduler - -from yolov7.data.dataset_mapper import MyDatasetMapper, MyDatasetMapper2 -from yolov7.config import add_yolo_config -from yolov7.utils.allreduce_norm import all_reduce_norm - - -class Trainer(DefaultTrainer): - - custom_mapper = None - - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - return COCOEvaluator(dataset_name, output_dir=output_folder) - - @classmethod - def build_train_loader(cls, cfg): - # return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) - # test our own dataset mapper to add more augmentations - cls.custom_mapper = MyDatasetMapper2(cfg, True) - return build_detection_train_loader(cfg, mapper=cls.custom_mapper) - - @classmethod - def build_model(cls, cfg): - model = build_model(cfg) - # logger = logging.getLogger(__name__) - # logger.info("Model:\n{}".format(model)) - return model - - def run_step(self): - self._trainer.iter = self.iter - self._trainer.run_step() - if comm.get_world_size() == 1: - self.model.update_iter(self.iter) - else: - self.model.module.update_iter(self.iter) - - if self.iter > self.cfg.INPUT.MOSAIC_AND_MIXUP.DISABLE_AT_ITER 
and self.cfg.INPUT.MOSAIC_AND_MIXUP.ENABLED: - # disable augmentation - self.cfg.defrost() - self.cfg.INPUT.MOSAIC_AND_MIXUP.ENABLED = False - self.cfg.freeze() - self.custom_mapper.disable_aug() - - # if comm.is_main_process(): - # # when eval period, apply all_reduce_norm as in https://github.com/Megvii-BaseDetection/YOLOX/issues/547#issuecomment-903220346 - # interval = self.cfg.SOLVER.CHECKPOINT_PERIOD if self.cfg.TEST.EVAL_PERIOD == 0 else self.cfg.TEST.EVAL_PERIOD - # if self.iter % interval == 0 and self.iter != 0: - # all_reduce_norm(self.model) - # self.checkpointer.save('latest') - - -def setup(args): - """ - Create configs and perform basic setups. - """ - cfg = get_cfg() - add_yolo_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.merge_from_list(args.opts) - cfg.freeze() - default_setup(cfg, args) - return cfg - - -def main(args): - cfg = setup(args) - - if args.eval_only: - model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( - cfg.MODEL.WEIGHTS, resume=args.resume - ) - res = Trainer.test(cfg, model) - return res - - trainer = Trainer(cfg) - trainer.resume_or_load(resume=args.resume) - # print('trainer.start: ', trainer.start_iter) - # trainer.model.iter = trainer.start_iter - # print('trainer.start: ', trainer.model.iter) - return trainer.train() - - -if __name__ == "__main__": - args = default_argument_parser().parse_args() - print("Command Line Args:", args) - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - args=(args,), - ) diff --git a/train_taco.py b/train_taco.py deleted file mode 100755 index a70812b..0000000 --- a/train_taco.py +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. - -""" - -Training script using custom coco format dataset - -what you need to do is simply change the img_dir and annotation path here -Also define your own categories. 
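The run_step override in the removed train_net.py above turned mosaic/mixup off once training passed INPUT.MOSAIC_AND_MIXUP.DISABLE_AT_ITER; the new train_det.py in this patch does not carry that override. If the behaviour is wanted again, its core is roughly this (a sketch based on the removed code, assuming the trainer keeps a reference to its mapper):

def maybe_disable_mosaic(trainer):
    """Turn mosaic/mixup off after INPUT.MOSAIC_AND_MIXUP.DISABLE_AT_ITER iterations."""
    cfg = trainer.cfg
    aug = cfg.INPUT.MOSAIC_AND_MIXUP
    if aug.ENABLED and trainer.iter > aug.DISABLE_AT_ITER:
        cfg.defrost()
        cfg.INPUT.MOSAIC_AND_MIXUP.ENABLED = False
        cfg.freeze()
        trainer.custom_mapper.disable_aug()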
- -""" - -import os -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch -from detectron2.evaluation import COCOEvaluator -from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetCatalog -from detectron2.data.datasets.coco import load_coco_json -from detectron2.data.dataset_mapper import DatasetMapper - -from yolov7.config import add_yolo_config -from yolov7.data.dataset_mapper import MyDatasetMapper - - -# here is your dataset config -CLASS_NAMES = ['Aluminium foil', - 'Battery', - 'Aluminium blister pack', - 'Carded blister pack', - 'Other plastic bottle', - 'Clear plastic bottle', - 'Glass bottle', - 'Plastic bottle cap', - 'Metal bottle cap', - 'Broken glass', - 'Food Can', - 'Aerosol', - 'Drink can', - 'Toilet tube', - 'Other carton', - 'Egg carton', - 'Drink carton', - 'Corrugated carton', - 'Meal carton', - 'Pizza box', - 'Paper cup', - 'Disposable plastic cup', - 'Foam cup', - 'Glass cup', - 'Other plastic cup', - 'Food waste', - 'Glass jar', - 'Plastic lid', - 'Metal lid', - 'Other plastic', - 'Magazine paper', - 'Tissues', - 'Wrapping paper', - 'Normal paper', - 'Paper bag', - 'Plastified paper bag', - 'Plastic film', - 'Six pack rings', - 'Garbage bag', - 'Other plastic wrapper', - 'Single-use carrier bag', - 'Polypropylene bag', - 'Crisp packet', - 'Spread tub', - 'Tupperware', - 'Disposable food container', - 'Foam food container', - 'Other plastic container', - 'Plastic glooves', - 'Plastic utensils', - 'Pop tab', - 'Rope & strings', - 'Scrap metal', - 'Shoe', - 'Squeezable tube', - 'Plastic straw', - 'Paper straw', - 'Styrofoam piece', - 'Unlabeled litter', - 'Cigarette'] -DATASET_ROOT = './datasets/taco' -ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') -TRAIN_PATH = os.path.join(DATASET_ROOT, 'images') -VAL_PATH = os.path.join(DATASET_ROOT, 'images') -TRAIN_JSON = os.path.join(ANN_ROOT, 'train_train.json') -VAL_JSON = os.path.join(ANN_ROOT, 'train_val.json') -PREDEFINED_SPLITS_DATASET = { - "taco_train": (TRAIN_PATH, TRAIN_JSON), - "taco_val": (VAL_PATH, VAL_JSON), -} - - -def plain_register_dataset(): - for k, v in PREDEFINED_SPLITS_DATASET.items(): - DatasetCatalog.register( - k, lambda: load_coco_json(v[1], v[0])) - MetadataCatalog.get(k).set(thing_classes=CLASS_NAMES, - evaluator_type='coco', - json_file=v[1], - image_root=v[0]) - - -plain_register_dataset() - - -class Trainer(DefaultTrainer): - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - return COCOEvaluator(dataset_name, output_dir=output_folder) - - @classmethod - def build_train_loader(cls, cfg): - return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) - # test our own dataset mapper to add more augmentations - # return build_detection_train_loader(cfg, mapper=MyDatasetMapper(cfg, True)) - - -def setup(args): - """ - Create configs and perform basic setups. 
- """ - cfg = get_cfg() - add_yolo_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.merge_from_list(args.opts) - cfg.freeze() - default_setup(cfg, args) - return cfg - - -def main(args): - cfg = setup(args) - - if args.eval_only: - model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( - cfg.MODEL.WEIGHTS, resume=args.resume - ) - res = Trainer.test(cfg, model) - return res - - trainer = Trainer(cfg) - trainer.resume_or_load(resume=args.resume) - return trainer.train() - - -if __name__ == "__main__": - args = default_argument_parser().parse_args() - print("Command Line Args:", args) - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - args=(args,), - ) diff --git a/train_tl.py b/train_tl.py deleted file mode 100755 index 451aa53..0000000 --- a/train_tl.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. - -""" - -Training script using custom coco format dataset - -what you need to do is simply change the img_dir and annotation path here -Also define your own categories. - -""" - -import os -from datetime import timedelta -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch -from detectron2.evaluation import COCOEvaluator -from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetCatalog -from detectron2.data.datasets.coco import load_coco_json -from detectron2.data.dataset_mapper import DatasetMapper -from detectron2.data.datasets.coco import load_coco_json, register_coco_instances - -from yolov7.config import add_yolo_config -from yolov7.data.dataset_mapper import MyDatasetMapper2 - - -# here is your dataset config -DATASET_ROOT = './datasets/tl' -ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') -TRAIN_PATH = os.path.join(DATASET_ROOT, 'JPEGImages') -VAL_PATH = os.path.join(DATASET_ROOT, 'JPEGImages') -TRAIN_JSON = os.path.join(ANN_ROOT, 'annotations_coco_tls_train.json') -VAL_JSON = os.path.join(ANN_ROOT, 'annotations_coco_tls_val.json') - -register_coco_instances("tl_train", {}, TRAIN_JSON, TRAIN_PATH) -register_coco_instances("tl_val", {}, VAL_JSON, VAL_PATH) - - -class Trainer(DefaultTrainer): - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - return COCOEvaluator(dataset_name, output_dir=output_folder) - - @classmethod - def build_train_loader(cls, cfg): - # return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) - # test our own dataset mapper to add more augmentations - return build_detection_train_loader(cfg, mapper=MyDatasetMapper2(cfg, True)) - - -def setup(args): - """ - Create configs and perform basic setups. 
- """ - cfg = get_cfg() - add_yolo_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.merge_from_list(args.opts) - cfg.freeze() - default_setup(cfg, args) - return cfg - - -def main(args): - cfg = setup(args) - - if args.eval_only: - model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( - cfg.MODEL.WEIGHTS, resume=args.resume - ) - res = Trainer.test(cfg, model) - return res - - trainer = Trainer(cfg) - trainer.resume_or_load(resume=args.resume) - return trainer.train() - - -if __name__ == "__main__": - args = default_argument_parser().parse_args() - print("Command Line Args:", args) - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - timeout=timedelta(50), - args=(args,), - ) diff --git a/train_transformer.py b/train_transformer.py new file mode 100755 index 0000000..ab34f22 --- /dev/null +++ b/train_transformer.py @@ -0,0 +1,203 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Training Script. + +This script is a simplified version of the training script in detectron2/tools. +""" +import os +import sys +import itertools +import time +from typing import Any, Dict, List, Set + +import torch +from fvcore.nn.precise_bn import get_bn_modules + +import detectron2.utils.comm as comm +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import MetadataCatalog, build_detection_train_loader +from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch +from detectron2.evaluation import COCOEvaluator, verify_results +from detectron2.engine import hooks +from detectron2.modeling import build_model +from detectron2.solver.build import maybe_add_gradient_clipping + +from yolov7.data.dataset_mapper import DetrDatasetMapper +from yolov7.config import add_yolo_config +from yolov7.optimizer import build_optimizer_mapper + + +class Trainer(DefaultTrainer): + """ + Extension of the Trainer class adapted to DETR. + """ + + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + """ + Create evaluator(s) for a given dataset. + This uses the special metadata "evaluator_type" associated with each builtin dataset. + For your own dataset, you can simply create an evaluator manually in your + script and do not have to worry about the hacky if-else logic here. 
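As a rough illustration of the "create an evaluator manually" route mentioned in the docstring above (a minimal sketch, not part of the patch; it assumes a built `model`, a merged `cfg`, and a registered split such as detectron2's builtin "coco_2017_val"):

from detectron2.data import build_detection_test_loader
from detectron2.evaluation import COCOEvaluator, inference_on_dataset

# Hand-rolled evaluation on a registered split, independent of the Trainer's evaluator hook.
evaluator = COCOEvaluator("coco_2017_val", output_dir="./output/inference")
val_loader = build_detection_test_loader(cfg, "coco_2017_val")
print(inference_on_dataset(model, val_loader, evaluator))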
+ """ + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + return COCOEvaluator(dataset_name, cfg, True, output_folder) + + @classmethod + def build_train_loader(cls, cfg): + if "detr" in cfg.MODEL.META_ARCHITECTURE.lower(): + mapper = DetrDatasetMapper(cfg, True) + else: + mapper = None + return build_detection_train_loader(cfg, mapper=mapper) + + @classmethod + def build_optimizer(cls, cfg, model): + # params: List[Dict[str, Any]] = [] + # memo: Set[torch.nn.parameter.Parameter] = set() + # for key, value in model.named_parameters(recurse=True): + # if not value.requires_grad: + # continue + # # Avoid duplicating parameters + # if value in memo: + # continue + # memo.add(value) + # lr = cfg.SOLVER.BASE_LR + # weight_decay = cfg.SOLVER.WEIGHT_DECAY + # if "backbone" in key: + # lr = lr * cfg.SOLVER.BACKBONE_MULTIPLIER + # params += [{"params": [value], "lr": lr, + # "weight_decay": weight_decay}] + + # # optim: the optimizer class + # def maybe_add_full_model_gradient_clipping(optim): + # # detectron2 doesn't have full model gradient clipping now + # clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE + # enable = ( + # cfg.SOLVER.CLIP_GRADIENTS.ENABLED + # and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" + # and clip_norm_val > 0.0 + # ) + # class FullModelGradientClippingOptimizer(optim): + # def step(self, closure=None): + # all_params = itertools.chain( + # *[x["params"] for x in self.param_groups]) + # torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val) + # super().step(closure=closure) + + # return FullModelGradientClippingOptimizer if enable else optim + + # optimizer_type = cfg.SOLVER.OPTIMIZER + # if optimizer_type == "SGD": + # optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)( + # params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM + # ) + # elif optimizer_type == "ADAMW": + # optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)( + # params, cfg.SOLVER.BASE_LR + # ) + # else: + # raise NotImplementedError(f"no optimizer type {optimizer_type}") + # if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model": + # optimizer = maybe_add_gradient_clipping(cfg, optimizer) + # return optimizer + return build_optimizer_mapper(cfg, model) + + def build_hooks(self): + """ + Build a list of default hooks, including timing, evaluation, + checkpointing, lr scheduling, precise BN, writing events. + + Returns: + list[HookBase]: + """ + cfg = self.cfg.clone() + cfg.defrost() + cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN + + ret = [ + hooks.IterationTimer(), + hooks.LRScheduler(), + hooks.PreciseBN( + # Run at the same freq as (but before) evaluation. + cfg.TEST.EVAL_PERIOD, + self.model, + # Build a new data loader to not affect training + self.build_train_loader(cfg), + cfg.TEST.PRECISE_BN.NUM_ITER, + ) + if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) + else None, + ] + + # Do PreciseBN before checkpointer, because it updates the model and need to + # be saved by checkpointer. + # This is not always the best: if checkpointing has a different frequency, + # some checkpoints may have more precise statistics than others. 
+ if comm.is_main_process(): + ret.append(hooks.PeriodicCheckpointer( + self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) + + def test_and_save_results(): + self._last_eval_results = self.test(self.cfg, self.model) + return self._last_eval_results + + # Do evaluation after checkpointer, because then if it fails, + # we can use the saved checkpoint to debug. + ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) + + if comm.is_main_process(): + # Here the default print/log frequency of each writer is used. + # run writers in the end, so that evaluation metrics are written + ret.append(hooks.PeriodicWriter(self.build_writers(), period=200)) + return ret + + @classmethod + def build_model(cls, cfg): + # remove print model + model = build_model(cfg) + return model + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg() + add_yolo_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + return cfg + + +def main(args): + cfg = setup(args) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume) + res = Trainer.test(cfg, model) + if comm.is_main_process(): + verify_results(cfg, res) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/train_visdrone.py b/train_visdrone.py deleted file mode 100755 index 047dc60..0000000 --- a/train_visdrone.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. - -""" - -Training script using custom coco format dataset - -what you need to do is simply change the img_dir and annotation path here -Also define your own categories. 
- -""" - -import os -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch -from detectron2.evaluation import COCOEvaluator -from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetCatalog -from detectron2.data.datasets.coco import load_coco_json, register_coco_instances -from detectron2.data.dataset_mapper import DatasetMapper -from detectron2.modeling import build_model - -from yolov7.config import add_yolo_config -from yolov7.data.dataset_mapper import MyDatasetMapper2 - - -# here is your dataset config - -DATASET_ROOT = './datasets/visdrone' -ANN_ROOT = os.path.join(DATASET_ROOT, 'visdrone_coco_anno') -TRAIN_PATH = os.path.join(DATASET_ROOT, 'VisDrone2019-DET-train/images') -VAL_PATH = os.path.join(DATASET_ROOT, 'VisDrone2019-DET-val/images') -TRAIN_JSON = os.path.join(ANN_ROOT, 'VisDrone2019-DET_train_coco.json') -VAL_JSON = os.path.join(ANN_ROOT, 'VisDrone2019-DET_val_coco.json') - -register_coco_instances("visdrone_train", {}, TRAIN_JSON, TRAIN_PATH) -register_coco_instances("visdrone_val", {}, VAL_JSON, VAL_PATH) - - -class Trainer(DefaultTrainer): - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - return COCOEvaluator(dataset_name, output_dir=output_folder) - - @classmethod - def build_model(cls, cfg): - model = build_model(cfg) - return model - - @classmethod - def build_train_loader(cls, cfg): - # return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) - # test our own dataset mapper to add more augmentations - return build_detection_train_loader(cfg, mapper=MyDatasetMapper2(cfg, True)) - - -def setup(args): - """ - Create configs and perform basic setups. - """ - cfg = get_cfg() - add_yolo_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.merge_from_list(args.opts) - cfg.freeze() - default_setup(cfg, args) - return cfg - - -def main(args): - cfg = setup(args) - - if args.eval_only: - model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( - cfg.MODEL.WEIGHTS, resume=args.resume - ) - res = Trainer.test(cfg, model) - return res - - trainer = Trainer(cfg) - trainer.resume_or_load(resume=args.resume) - return trainer.train() - - -if __name__ == "__main__": - args = default_argument_parser().parse_args() - print("Command Line Args:", args) - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - args=(args,), - ) diff --git a/train_wearmask.py b/train_wearmask.py deleted file mode 100755 index abb56ec..0000000 --- a/train_wearmask.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (c) Facebook, Inc. and its affiliates. - -""" - -Training script using custom coco format dataset - -what you need to do is simply change the img_dir and annotation path here -Also define your own categories. 
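The dataset-specific launchers removed in this patch (taco, tl, visdrone, wearmask) all repeat the same recipe: register a COCO-format split, attach class names, and hand everything to the shared Trainer. A minimal sketch of that recipe, reusing the paths and classes from the deleted wearmask script (illustrative only, not a file in this patch):

import os
from detectron2.data import MetadataCatalog
from detectron2.data.datasets.coco import register_coco_instances

# Paths and class names taken from the deleted train_wearmask.py; adjust for your own data.
DATASET_ROOT = "./datasets/wearmask"
ANN_ROOT = os.path.join(DATASET_ROOT, "annotations")
register_coco_instances(
    "mask_train", {},
    os.path.join(ANN_ROOT, "train.json"),
    os.path.join(DATASET_ROOT, "images/train2017"),
)
register_coco_instances(
    "mask_val", {},
    os.path.join(ANN_ROOT, "val.json"),
    os.path.join(DATASET_ROOT, "images/val2017"),
)
# Class names can also be attached explicitly, as the deleted scripts did.
MetadataCatalog.get("mask_train").set(thing_classes=["mask", "no_mask"])
MetadataCatalog.get("mask_val").set(thing_classes=["mask", "no_mask"])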
- -""" - -import os -from detectron2.checkpoint import DetectionCheckpointer -from detectron2.config import get_cfg -from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch -from detectron2.evaluation import COCOEvaluator -from detectron2.data import MetadataCatalog, build_detection_train_loader, DatasetCatalog -from detectron2.data.datasets.coco import load_coco_json -from detectron2.data.dataset_mapper import DatasetMapper - -from yolov7.config import add_yolo_config -from yolov7.data.dataset_mapper import MyDatasetMapper2 - - -# here is your dataset config -CLASS_NAMES = ['mask', 'no_mask'] -DATASET_ROOT = './datasets/wearmask' -ANN_ROOT = os.path.join(DATASET_ROOT, 'annotations') -TRAIN_PATH = os.path.join(DATASET_ROOT, 'images/train2017') -VAL_PATH = os.path.join(DATASET_ROOT, 'images/val2017') -TRAIN_JSON = os.path.join(ANN_ROOT, 'train.json') -VAL_JSON = os.path.join(ANN_ROOT, 'val.json') -PREDEFINED_SPLITS_DATASET = { - "mask_train": (TRAIN_PATH, TRAIN_JSON), - "mask_val": (VAL_PATH, VAL_JSON), -} - - -def plain_register_dataset(): - for k, v in PREDEFINED_SPLITS_DATASET.items(): - DatasetCatalog.register( - k, lambda: load_coco_json(v[1], v[0])) - MetadataCatalog.get(k).set(thing_classes=CLASS_NAMES, - evaluator_type='coco', - json_file=v[1], - image_root=v[0]) - - -plain_register_dataset() - - -class Trainer(DefaultTrainer): - @classmethod - def build_evaluator(cls, cfg, dataset_name, output_folder=None): - if output_folder is None: - output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") - return COCOEvaluator(dataset_name, output_dir=output_folder) - - @classmethod - def build_train_loader(cls, cfg): - # return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) - # test our own dataset mapper to add more augmentations - return build_detection_train_loader(cfg, mapper=MyDatasetMapper2(cfg, True)) - - -def setup(args): - """ - Create configs and perform basic setups. 
- """ - cfg = get_cfg() - add_yolo_config(cfg) - cfg.merge_from_file(args.config_file) - cfg.merge_from_list(args.opts) - cfg.freeze() - default_setup(cfg, args) - return cfg - - -def main(args): - cfg = setup(args) - - if args.eval_only: - model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( - cfg.MODEL.WEIGHTS, resume=args.resume - ) - res = Trainer.test(cfg, model) - return res - - trainer = Trainer(cfg) - trainer.resume_or_load(resume=args.resume) - return trainer.train() - - -if __name__ == "__main__": - args = default_argument_parser().parse_args() - print("Command Line Args:", args) - launch( - main, - args.num_gpus, - num_machines=args.num_machines, - machine_rank=args.machine_rank, - dist_url=args.dist_url, - args=(args,), - ) diff --git a/wandadb/__init__.py b/wandadb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wandadb/wandb_logger.py b/wandadb/wandb_logger.py new file mode 100644 index 0000000..480c80a --- /dev/null +++ b/wandadb/wandb_logger.py @@ -0,0 +1,207 @@ +import importlib +from pathlib import Path +from typing import Union, Dict, Any + +import cv2 +import numpy as np +import wandb +from alfred.vis.image.get_dataset_label_map import coco_label_map_list +from detectron2.utils.visualizer import GenericMask + +coco_label_map = {k: v for k, v in enumerate(coco_label_map_list) if isinstance(v, str)} + + +def is_wandb_available(): + return importlib.util.find_spec("wandb") is not None + + +class WandbFormatter: + """Converts detectron2 output to wandb.Image arguments""" + + def __init__( + self, + image_path: Union[str, Path], + class_names: Dict[Any, Any], + conf_threshold: float = 0.7, + ): + self.image_path = image_path + self.image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB) + self.class_names = class_names + self.conf_threshold = conf_threshold + self.class_set = wandb.Classes( + [{"id": idx, "name": name} for idx, name in self.class_names.items()] + ) + + def convert_instance_predictions(self, predictions): + """ + Converts instance-level prediction results for an image. + + Args: + predictions (Instances): the output of an instance detection/segmentation + model. Following fields will be used to create the final dictionary to pass to wandb + "pred_boxes", "pred_classes", "scores", "pred_masks". 
+ + Returns: + output Dict[str,Any]: image with kwargs for wandb logger + """ + boxes = ( + predictions.pred_boxes.tensor.cpu().numpy().tolist() + if predictions.has("pred_boxes") + else None + ) + scores = ( + predictions.scores.cpu().numpy().tolist() + if predictions.has("scores") + else None + ) + classes = ( + predictions.pred_classes.cpu().numpy().tolist() + if predictions.has("pred_classes") + else None + ) + + if predictions.has("pred_masks"): + masks = predictions.pred_masks.cpu().numpy() + masks = [ + GenericMask(x, self.image.shape[0], self.image.shape[1]) for x in masks + ] + else: + masks = None + + num_objects = 0 + confidences = [] + + if masks is not None: + boxes = [] + final_mask = np.zeros( + (self.image.shape[0], self.image.shape[1]), dtype=np.uint8 + ) + for i, mask in enumerate(masks): + if scores[i] > self.conf_threshold: + pred_mask = mask.mask + try: + boxes.append(mask.bbox()) + except IndexError: + pass + pred_class = int(classes[i]) + 1 + final_mask = np.ma.array(final_mask, mask=pred_mask) + final_mask = final_mask.filled(pred_class) + num_objects += 1 + confidences.append(scores[i]) + final_mask = final_mask.astype(np.uint8) + masks = { + "prediction": { + "mask_data": final_mask, + "class_labels": self.class_names, + } + } + if boxes is not None: + boxes_data = [] + for i, box in enumerate(boxes): + if scores[i] > self.conf_threshold: + pred_class = int(classes[i]) + 1 + caption = ( + f"{pred_class}" + if not self.class_names + else self.class_names[pred_class] + ) + boxes_data.append( + { + "position": { + "minX": box[0], + "minY": box[1], + "maxX": box[2], + "maxY": box[3], + }, + "class_id": pred_class, + "box_caption": f"{i}: {caption} @ {scores[i] * 100:.2f}%", + "scores": {"class_score": scores[i]}, + "domain": "pixel", + } + ) + if masks is None: + confidences.append(scores[i]) + num_objects += 1 + if boxes_data: + boxes = { + "prediction": { + "box_data": boxes_data, + "class_labels": self.class_names, + } + } + else: + boxes = None + row = ( + str(Path(self.image_path).name), + wandb.Image( + data_or_path=self.image_path, + boxes=boxes, + masks=masks, + classes=self.class_set, + ), + num_objects, + confidences, + ) + return row + + +class WandbInferenceLogger: + """ + Logs inference images and predictions to wandb. + Currently, supports bounding boxes and instance segmentation. 
+ """ + + def __init__( + self, + wandb_entity: str = None, + wandb_project: str = None, + class_names: Dict[int, str] = None, + conf_threshold: float = 0.7, + config=None, + ): + if not is_wandb_available(): + raise ImportError( + "Please install wandb using 'pip install wandb --upgrade'" + ) + + self.class_names = class_names if class_names else coco_label_map + self.wandb = wandb + self.run = None + if wandb.run: + self.run = wandb.run + else: + if wandb_project is None: + raise ValueError("wandb_project is required for wandb logger ") + self.run = wandb.init( + project=wandb_project, entity=wandb_entity, config=config, + ) + self.dataset_name = self.run.id + "_dataset" + self.conf_threshold = conf_threshold + self.table: wandb.Table = self.wandb.Table( + columns=[ + "Image-File", + "Predictions", + "Number-of-Objects", + "Prediction-Confidence", + ] + ) + + def log_inference(self, image, result): + """adds the inference result to a table in wandb.""" + if not self.run: + return None + formatter = WandbFormatter( + image, class_names=self.class_names, conf_threshold=self.conf_threshold + ) + image_name = str(Path(image).stem) + instance_prediction = formatter.convert_instance_predictions( + result["instances"] + ) + self.table.add_data(*instance_prediction) + + def finish_run(self): + """Uploads the table to wandb, finishes the run.""" + if not self.run: + return None + self.run.log({self.dataset_name: self.table}) + self.run.finish() diff --git a/weights/.gitignore b/weights/.gitignore new file mode 100644 index 0000000..c18880d --- /dev/null +++ b/weights/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!get_models.sh diff --git a/weights/get_models.sh b/weights/get_models.sh new file mode 100644 index 0000000..5752017 --- /dev/null +++ b/weights/get_models.sh @@ -0,0 +1 @@ +gdown https://drive.google.com/file/d/1MK8rO3qtA7vN9KVSBdp0VvZHCNq8-bvz/view\?usp\=sharing --fuzzy diff --git a/yolov7/__init__.py b/yolov7/__init__.py old mode 100755 new mode 100644 diff --git a/yolov7/config.py b/yolov7/config.py old mode 100755 new mode 100644 index d6f2fce..a6a584c --- a/yolov7/config.py +++ b/yolov7/config.py @@ -2,6 +2,10 @@ # Copyright (c) Facebook, Inc. and its affiliates. 
from detectron2.config import CfgNode as CN +from .utils.get_default_cfg import get_default_solver_configs +from .modeling.backbone.cfg import add_fbnet_v2_default_configs +from .configs.config_sparseinst import add_sparse_inst_config +from .configs.config_convnext import add_convnext_default_configs def add_yolo_config(cfg): @@ -10,18 +14,27 @@ def add_yolo_config(cfg): """ _C = cfg + get_default_solver_configs(_C) + add_fbnet_v2_default_configs(_C) + add_sparse_inst_config(_C) + add_convnext_default_configs(_C) + + _C.DATASETS.CLASS_NAMES = [] + # Allowed values are 'normal', 'softnms-linear', 'softnms-gaussian', 'cluster' _C.MODEL.NMS_TYPE = "normal" _C.MODEL.ONNX_EXPORT = False _C.MODEL.PADDED_VALUE = 114.0 _C.MODEL.FPN.REPEAT = 2 _C.MODEL.FPN.OUT_CHANNELS_LIST = [256, 512, 1024] + # _C.MODEL.BACKBONE.STRIDE = [] + # _C.MODEL.BACKBONE.CHANNEL = [] # Add Bi-FPN support _C.MODEL.BIFPN = CN() _C.MODEL.BIFPN.NUM_LEVELS = 5 _C.MODEL.BIFPN.NUM_BIFPN = 6 - _C.MODEL.BIFPN.NORM = 'GN' + _C.MODEL.BIFPN.NORM = "GN" _C.MODEL.BIFPN.OUT_CHANNELS = 160 _C.MODEL.BIFPN.SEPARABLE_CONV = False @@ -35,20 +48,20 @@ def add_yolo_config(cfg): _C.SOLVER.LR_SCHEDULER.MAX_ITER = 40000 _C.SOLVER.LR_SCHEDULER.MAX_EPOCH = 500 _C.SOLVER.LR_SCHEDULER.STEPS = (30000,) - _C.SOLVER.LR_SCHEDULER.WARMUP_FACTOR = 1.0/1000 + _C.SOLVER.LR_SCHEDULER.WARMUP_FACTOR = 1.0 / 1000 _C.SOLVER.LR_SCHEDULER.WARMUP_ITERS = 1000 _C.SOLVER.LR_SCHEDULER.WARMUP_METHOD = "linear" _C.SOLVER.LR_SCHEDULER.GAMMA = 0.1 # Add Input - _C.INPUT.INPUT_SIZE = [640, 640] # h,w order + _C.INPUT.INPUT_SIZE = [640, 640] # h,w order # Add yolo config _C.MODEL.YOLO = CN() _C.MODEL.YOLO.NUM_BRANCH = 3 _C.MODEL.YOLO.BRANCH_DILATIONS = [1, 2, 3] _C.MODEL.YOLO.TEST_BRANCH_IDX = 1 - _C.MODEL.YOLO.VARIANT = 'yolov3' # can be yolov5 yolov7 as well + _C.MODEL.YOLO.VARIANT = "yolov3" # can be yolov5 yolov7 as well _C.MODEL.YOLO.ANCHORS = [ [[116, 90], [156, 198], [373, 326]], [[30, 61], [62, 45], [42, 119]], @@ -61,6 +74,7 @@ def add_yolo_config(cfg): _C.MODEL.YOLO.CONF_THRESHOLD = 0.01 _C.MODEL.YOLO.NMS_THRESHOLD = 0.5 _C.MODEL.YOLO.IGNORE_THRESHOLD = 0.07 + _C.MODEL.YOLO.NORMALIZE_INPUT = False _C.MODEL.YOLO.WIDTH_MUL = 1.0 _C.MODEL.YOLO.DEPTH_MUL = 1.0 @@ -79,8 +93,11 @@ def add_yolo_config(cfg): _C.MODEL.YOLO.LOSS.BUILD_TARGET_TYPE = "default" _C.MODEL.YOLO.NECK = CN() - _C.MODEL.YOLO.NECK.TYPE = "yolov3" # default is FPN, can be pafpn as well - _C.MODEL.YOLO.NECK.WITH_SPP = False # + _C.MODEL.YOLO.NECK.TYPE = "yolov3" # default is FPN, can be pafpn as well + _C.MODEL.YOLO.NECK.WITH_SPP = False # + + _C.MODEL.YOLO.HEAD = CN() + _C.MODEL.YOLO.HEAD.TYPE = "yolox" _C.MODEL.YOLO.ORIEN_HEAD = CN() _C.MODEL.YOLO.ORIEN_HEAD.UP_CHANNELS = 64 @@ -114,17 +131,28 @@ def add_yolo_config(cfg): _C.MODEL.EFFICIENTNET.NAME = "efficientnet_b0" _C.MODEL.EFFICIENTNET.PRETRAINED = True _C.MODEL.EFFICIENTNET.FEATURE_INDICES = [1, 4, 10, 15] - _C.MODEL.EFFICIENTNET.OUT_FEATURES = [ - "stride4", "stride8", "stride16", "stride32"] + _C.MODEL.EFFICIENTNET.OUT_FEATURES = ["stride4", "stride8", "stride16", "stride32"] + + # _C.MODEL.BACKBONE = CN() + _C.MODEL.BACKBONE.SUBTYPE = "s" + _C.MODEL.BACKBONE.PRETRAINED = True + _C.MODEL.BACKBONE.WEIGHTS = "" + _C.MODEL.BACKBONE.FEATURE_INDICES = [1, 4, 10, 15] + _C.MODEL.BACKBONE.OUT_FEATURES = ["stride8", "stride16", "stride32"] - # add SOLOv2 options _C.MODEL.SOLOV2 = CN() # Instance hyper-parameters _C.MODEL.SOLOV2.INSTANCE_IN_FEATURES = ["p2", "p3", "p4", "p5", "p6"] _C.MODEL.SOLOV2.FPN_INSTANCE_STRIDES = [8, 8, 16, 32, 32] - 
_C.MODEL.SOLOV2.FPN_SCALE_RANGES = ((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048)) + _C.MODEL.SOLOV2.FPN_SCALE_RANGES = ( + (1, 96), + (48, 192), + (96, 384), + (192, 768), + (384, 2048), + ) _C.MODEL.SOLOV2.SIGMA = 0.2 # Channel size for the instance head. _C.MODEL.SOLOV2.INSTANCE_IN_CHANNELS = 256 @@ -132,7 +160,7 @@ def add_yolo_config(cfg): # Convolutions to use in the instance head. _C.MODEL.SOLOV2.NUM_INSTANCE_CONVS = 4 _C.MODEL.SOLOV2.USE_DCN_IN_INSTANCE = False - _C.MODEL.SOLOV2.TYPE_DCN = 'DCN' + _C.MODEL.SOLOV2.TYPE_DCN = "DCN" _C.MODEL.SOLOV2.NUM_GRIDS = [40, 36, 24, 16, 12] # Number of foreground classes. _C.MODEL.SOLOV2.NUM_CLASSES = 80 @@ -171,27 +199,50 @@ def add_yolo_config(cfg): # DETR config cfg.MODEL.DETR = CN() cfg.MODEL.DETR.NUM_CLASSES = 80 + cfg.MODEL.BACKBONE.SIMPLE = False + cfg.MODEL.BACKBONE.STRIDE = 1 + cfg.MODEL.BACKBONE.CHANNEL = 0 + + # FBNet + cfg.MODEL.FBNET_V2.OUT_FEATURES = ["trunk3"] + # For Segmentation - cfg.MODEL.DETR.FROZEN_WEIGHTS = '' + cfg.MODEL.DETR.FROZEN_WEIGHTS = "" # LOSS + cfg.MODEL.DETR.DEFORMABLE = False + cfg.MODEL.DETR.USE_FOCAL_LOSS = False + cfg.MODEL.DETR.CENTERED_POSITION_ENCODIND = False + cfg.MODEL.DETR.CLS_WEIGHT = 1.0 + cfg.MODEL.DETR.NUM_FEATURE_LEVELS = 4 cfg.MODEL.DETR.GIOU_WEIGHT = 2.0 cfg.MODEL.DETR.L1_WEIGHT = 5.0 cfg.MODEL.DETR.DEEP_SUPERVISION = True cfg.MODEL.DETR.NO_OBJECT_WEIGHT = 0.1 + cfg.MODEL.DETR.WITH_BOX_REFINE = False + cfg.MODEL.DETR.TWO_STAGE = False + cfg.MODEL.DETR.DECODER_BLOCK_GRAD = True + # TRANSFORMER + cfg.MODEL.DETR.ATTENTION_TYPE = "DETR" # can be SMCA, RCDA cfg.MODEL.DETR.NHEADS = 8 cfg.MODEL.DETR.DROPOUT = 0.1 cfg.MODEL.DETR.DIM_FEEDFORWARD = 2048 cfg.MODEL.DETR.ENC_LAYERS = 6 cfg.MODEL.DETR.DEC_LAYERS = 6 cfg.MODEL.DETR.PRE_NORM = False + cfg.MODEL.DETR.BBOX_EMBED_NUM_LAYERS = 3 cfg.MODEL.DETR.HIDDEN_DIM = 256 cfg.MODEL.DETR.NUM_OBJECT_QUERIES = 100 + cfg.MODEL.DETR.FROZEN_WEIGHTS = "" + cfg.MODEL.DETR.NUM_FEATURE_LEVELS = 1 # can be 3 tambien + # for AnchorDETR + cfg.MODEL.DETR.NUM_QUERY_POSITION = 300 + cfg.MODEL.DETR.NUM_QUERY_PATTERN = 3 + cfg.MODEL.DETR.SPATIAL_PRIOR = "learned" cfg.SOLVER.OPTIMIZER = "ADAMW" cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 - # Input Configs # Mosaic part _C.INPUT.MOSAIC = CN() diff --git a/yolov7/configs/config_convnext.py b/yolov7/configs/config_convnext.py new file mode 100644 index 0000000..328f73a --- /dev/null +++ b/yolov7/configs/config_convnext.py @@ -0,0 +1,9 @@ + +from detectron2.config import CfgNode as CN + +def add_convnext_default_configs(_C): + _C.MODEL.CONVNEXT = CN() + + _C.MODEL.CONVNEXT.OUT_FEATURES = ["dark3", "dark4", "dark5"] + _C.MODEL.CONVNEXT.WEIGHTS = "" + _C.MODEL.CONVNEXT.DEPTH_WISE = False diff --git a/yolov7/configs/config_sparseinst.py b/yolov7/configs/config_sparseinst.py new file mode 100644 index 0000000..600b035 --- /dev/null +++ b/yolov7/configs/config_sparseinst.py @@ -0,0 +1,68 @@ +# Copyright (c) Tianheng Cheng and its affiliates. 
All Rights Reserved + +from detectron2.config import CfgNode as CN + + +def add_sparse_inst_config(cfg): + + cfg.MODEL.DEVICE = "cuda" + cfg.MODEL.MASK_ON = True + # [SparseInst] + cfg.MODEL.SPARSE_INST = CN() + + # parameters for inference + cfg.MODEL.SPARSE_INST.CLS_THRESHOLD = 0.005 + cfg.MODEL.SPARSE_INST.MASK_THRESHOLD = 0.45 + cfg.MODEL.SPARSE_INST.MAX_DETECTIONS = 100 + + # [Encoder] + cfg.MODEL.SPARSE_INST.ENCODER = CN() + cfg.MODEL.SPARSE_INST.ENCODER.NAME = "FPNPPMEncoder" + cfg.MODEL.SPARSE_INST.ENCODER.NORM = "" + cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES = ["res3", "res4", "res5"] + cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS = 256 + + # [Decoder] + cfg.MODEL.SPARSE_INST.DECODER = CN() + cfg.MODEL.SPARSE_INST.DECODER.NAME = "BaseIAMDecoder" + cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS = 100 + cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES = 80 + # kernels for mask features + cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM = 128 + # upsample factor for output masks + cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR = 2.0 + cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM = False + cfg.MODEL.SPARSE_INST.DECODER.GROUPS = 4 + # decoder.inst_branch + cfg.MODEL.SPARSE_INST.DECODER.INST = CN() + cfg.MODEL.SPARSE_INST.DECODER.INST.DIM = 256 + cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS = 4 + # decoder.mask_branch + cfg.MODEL.SPARSE_INST.DECODER.MASK = CN() + cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM = 256 + cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS = 4 + + # [Loss] + cfg.MODEL.SPARSE_INST.LOSS = CN() + cfg.MODEL.SPARSE_INST.LOSS.NAME = "SparseInstCriterion" + cfg.MODEL.SPARSE_INST.LOSS.ITEMS = ("labels", "masks") + # loss weight + cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT = 2.0 + cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT = 5.0 + cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT = 2.0 + # iou-aware objectness loss weight + cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT = 1.0 + + # [Matcher] + cfg.MODEL.SPARSE_INST.MATCHER = CN() + cfg.MODEL.SPARSE_INST.MATCHER.NAME = "SparseInstMatcher" + cfg.MODEL.SPARSE_INST.MATCHER.ALPHA = 0.8 + cfg.MODEL.SPARSE_INST.MATCHER.BETA = 0.2 + + # [Optimizer] + cfg.SOLVER.OPTIMIZER = "ADAMW" + cfg.SOLVER.BACKBONE_MULTIPLIER = 1.0 + cfg.SOLVER.AMSGRAD = False + + # [Dataset mapper] + cfg.MODEL.SPARSE_INST.DATASET_MAPPER = "SparseInstDatasetMapper" diff --git a/yolov7/data/config.py b/yolov7/data/config.py new file mode 100644 index 0000000..1761525 --- /dev/null +++ b/yolov7/data/config.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + + +from detectron2.config import CfgNode as CN + + +def add_d2go_data_default_configs(_C): + _C.D2GO_DATA = CN() + + # Config for "detectron2go.data.extended_coco.extended_coco_load" + _C.D2GO_DATA.DATASETS = CN() + # List of class names to use when loading the data, this applies to train + # and test separately. Default value means using all classes, otherwise it'll create + # new json file containing only given categories. + _C.D2GO_DATA.DATASETS.TRAIN_CATEGORIES = () + _C.D2GO_DATA.DATASETS.TEST_CATEGORIES = () + + # Register a list of COCO datasets in config + # The following specifies additional coco data to inject. The required is the + # name (NAMES), image root (IM_DIRS), coco json file (JSON_FILES) while keypoint + # metadata (KEYPOINT_METADATA) is optional. The keypoint metadata name provided + # here is used to lookup the metadata specified within the KEYPOINT_METADATA + # metadata registry specified in "data/keypoint_metadata_registry.py". 
For adding # new use cases, simply register new metadata to that registry. + _C.D2GO_DATA.DATASETS.COCO_INJECTION = CN() + _C.D2GO_DATA.DATASETS.COCO_INJECTION.NAMES = [] + _C.D2GO_DATA.DATASETS.COCO_INJECTION.IM_DIRS = [] + _C.D2GO_DATA.DATASETS.COCO_INJECTION.JSON_FILES = [] + _C.D2GO_DATA.DATASETS.COCO_INJECTION.KEYPOINT_METADATA = [] + + # On-the-fly register a list of datasets located under detectron2go/datasets + # by specifying the filename (without .py). + _C.D2GO_DATA.DATASETS.DYNAMIC_DATASETS = [] + + # TODO: potentially add this config + # # List of extra keys in annotation, the item will be forwarded by + # # extended_coco_load. + # _C.D2GO_DATA.DATASETS.ANNOTATION_FIELDS_TO_FORWARD = () + + # Config for D2GoDatasetMapper + _C.D2GO_DATA.MAPPER = CN() + # dataset mapper name + _C.D2GO_DATA.MAPPER.NAME = "D2GoDatasetMapper" + # When enabled, image item from json dataset doesn't need to have width/height, + # they will be backfilled once image is loaded. This may cause issues when + # width/height is actually used by extended_coco_load, e.g. grouping + # by aspect ratio. + _C.D2GO_DATA.MAPPER.BACKFILL_SIZE = False + _C.D2GO_DATA.MAPPER.RETRY = 3 + _C.D2GO_DATA.MAPPER.CATCH_EXCEPTION = True + + _C.D2GO_DATA.AUG_OPS = CN() + # List of transforms that are represented by string. Each string starts with + # a registered name in TRANSFORM_OP_REGISTRY, optionally followed by a string + # argument (separated by "::") which can be used for initializing the + # transform object. See build_transform_gen for the details. + # Some examples are: + # example 1: RandomFlipOp + # example 2: RandomFlipOp::{} + # example 3: RandomFlipOp::{"prob":0.5} + # example 4: RandomBrightnessOp::{"intensity_min":1.0, "intensity_max":2.0} + # NOTE: search "example repr:" in fbcode for examples. + _C.D2GO_DATA.AUG_OPS.TRAIN = ["ResizeShortestEdgeOp", "RandomFlipOp"] + _C.D2GO_DATA.AUG_OPS.TEST = ["ResizeShortestEdgeOp"] + + _C.D2GO_DATA.TEST = CN() + # Evaluate on the first specified number of images for each dataset during + # testing, default value 0 means using all images. + # NOTE: See maybe_subsample_n_images for details.

+ _C.D2GO_DATA.TEST.MAX_IMAGES = 0 + _C.D2GO_DATA.TEST.SUBSET_SAMPLING = "frontmost" # one of {"frontmost", "random"} + + return _C diff --git a/yolov7/data/dataset_mapper.py b/yolov7/data/dataset_mapper.py old mode 100755 new mode 100644 diff --git a/yolov7/data/detection_utils.py b/yolov7/data/detection_utils.py old mode 100755 new mode 100644 index c88537f..d4c9b72 --- a/yolov7/data/detection_utils.py +++ b/yolov7/data/detection_utils.py @@ -13,6 +13,7 @@ from detectron2.data.transforms import RandomFlip, RandomBrightness, RandomLighting, RandomSaturation from detectron2.data.transforms import RandomFlip from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy +from pycocotools import mask as mask_util def build_augmentation(cfg, is_train): @@ -170,6 +171,29 @@ def transform_instance_annotations( bbox = transforms.apply_box(np.array([bbox]))[0].clip(min=0) annotation["bbox"] = np.minimum(bbox, list(image_size + image_size)[::-1]) annotation["bbox_mode"] = BoxMode.XYXY_ABS + + # apply transforms to segmentation + if "segmentation" in annotation: + # each instance contains 1 or more polygons + segm = annotation["segmentation"] + if isinstance(segm, list): + # polygons + polygons = [np.asarray(p).reshape(-1, 2) for p in segm] + annotation["segmentation"] = [ + p.reshape(-1) for p in transforms.apply_polygons(polygons) + ] + elif isinstance(segm, dict): + # RLE + mask = mask_util.decode(segm) + mask = transforms.apply_segmentation(mask) + assert tuple(mask.shape[:2]) == image_size + annotation["segmentation"] = mask + else: + raise ValueError( + "Cannot transform segmentation of type '{}'!" + "Supported types are: polygons as list[list[float] or ndarray]," + " COCO-style RLE as a dict.".format(type(segm)) + ) # add meta_infos if add_meta_infos: diff --git a/yolov7/data/transforms/__init__.py b/yolov7/data/transforms/__init__.py old mode 100755 new mode 100644 diff --git a/yolov7/data/transforms/augmentation_impl.py b/yolov7/data/transforms/augmentation_impl.py old mode 100755 new mode 100644 diff --git a/yolov7/data/transforms/data_augment.py b/yolov7/data/transforms/data_augment.py old mode 100755 new mode 100644 diff --git a/yolov7/data/transforms/transform.py b/yolov7/data/transforms/transform.py old mode 100755 new mode 100644 diff --git a/yolov7/evaluation/coco_evaluation.py b/yolov7/evaluation/coco_evaluation.py new file mode 100644 index 0000000..511f893 --- /dev/null +++ b/yolov7/evaluation/coco_evaluation.py @@ -0,0 +1,99 @@ +"""Code from https://github.com/hustvl/SparseInst/blob/8e75c646233822ee751253c799eb58226eb5f577/sparseinst/coco_evaluation.py#L24 + +Models such as SparseInst are able to predict segmentation masks without predicting bounding boxes first. +However, objects of class 'Instance' in detectrin2 require an attribute 'pred_boxes'. +We overwrite here the function instances_to_coco_json to allow Instances without pred_boxes. +""" + +import numpy as np +import pycocotools.mask as mask_util +from detectron2.structures import BoxMode +from detectron2.evaluation import COCOEvaluator + + +def instances_to_coco_json(instances, img_id): + """ + Dump an "Instances" object to a COCO-format json that's used for evaluation. + Args: + instances (Instances): + img_id (int): the image id + Returns: + list[dict]: list of json annotations in COCO format. 
+ """ + num_instance = len(instances) + if num_instance == 0: + return [] + + # NOTE: pure instance segmentation + has_box = instances.has("pred_boxes") + if has_box: + boxes = instances.pred_boxes.tensor.numpy() + boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + boxes = boxes.tolist() + + scores = instances.scores.tolist() + classes = instances.pred_classes.tolist() + + has_mask = instances.has("pred_masks") + if has_mask: + # use RLE to encode the masks, because they are too large and takes memory + # since this evaluator stores outputs of the entire dataset + rles = [ + mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] + for mask in instances.pred_masks + ] + for rle in rles: + # "counts" is an array encoded by mask_util as a byte-stream. Python3's + # json writer which always produces strings cannot serialize a bytestream + # unless you decode it. Thankfully, utf-8 works out (which is also what + # the pycocotools/_mask.pyx does). + rle["counts"] = rle["counts"].decode("utf-8") + + has_keypoints = instances.has("pred_keypoints") + if has_keypoints: + keypoints = instances.pred_keypoints + + results = [] + for k in range(num_instance): + result = { + "image_id": img_id, + "category_id": classes[k], + "score": scores[k], + } + if has_box: + result["bbox"] = boxes[k] + if has_mask: + result["segmentation"] = rles[k] + if has_keypoints: + # In COCO annotations, + # keypoints coordinates are pixel indices. + # However our predictions are floating point coordinates. + # Therefore we subtract 0.5 to be consistent with the annotation format. + # This is the inverse of data loading logic in `datasets/coco.py`. + keypoints[k][:, :2] -= 0.5 + result["keypoints"] = keypoints[k].flatten().tolist() + results.append(result) + return results + +# Also overload the function 'process' from COCOEvaluator, so that it calls the newly defined function instances_to_coco_json +class COCOMaskEvaluator(COCOEvaluator): + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). + It is a list of dict. Each dict corresponds to an image and + contains keys like "height", "width", "file_name", "image_id". + outputs: the outputs of a COCO model. It is a list of dicts with key + "instances" that contains :class:`Instances`. 
+ """ + for input, output in zip(inputs, outputs): + prediction = {"image_id": input["image_id"]} + + if "instances" in output: + instances = output["instances"].to(self._cpu_device) + prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) + if "proposals" in output: + prediction["proposals"] = output["proposals"].to(self._cpu_device) + if len(prediction) > 1: + self._predictions.append(prediction) \ No newline at end of file diff --git a/yolov7/modeling/__init__.py b/yolov7/modeling/__init__.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/__init__.py b/yolov7/modeling/backbone/__init__.py old mode 100755 new mode 100644 index e46907c..8cd6d0d --- a/yolov7/modeling/backbone/__init__.py +++ b/yolov7/modeling/backbone/__init__.py @@ -7,4 +7,10 @@ from .res2nets.wrapper import build_res2net_backbone from .darknetx import build_cspdarknetx_backbone -from .regnet import build_regnet_backbone \ No newline at end of file +from .regnet import build_regnet_backbone +from .fbnet_v3 import * +from .fbnet_v2 import FBNetV2C4Backbone, build_fbnet +from .resnetvd import build_resnet_vd_backbone + +from .convnext import build_convnext_backbone +from .efficientrep import build_efficientrep_backbone \ No newline at end of file diff --git a/yolov7/modeling/backbone/anchordetr_backbone.py b/yolov7/modeling/backbone/anchordetr_backbone.py old mode 100755 new mode 100644 index e69de29..4e72cf5 --- a/yolov7/modeling/backbone/anchordetr_backbone.py +++ b/yolov7/modeling/backbone/anchordetr_backbone.py @@ -0,0 +1,444 @@ +# ------------------------------------------------------------------------ +# Copyright (c) 2021 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +# ------------------------------------------------------------------------ +import copy +from typing import Optional, List +import math + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + +from yolov7.utils.misc import inverse_sigmoid + +from .layers.row_column_decoupled_attention import MultiheadRCDA + + +class Transformer(nn.Module): + def __init__(self, num_classes=91, d_model=256, nhead=8, + num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0., + activation="relu", num_feature_levels=3, num_query_position=300, num_query_pattern=3, + spatial_prior="learned", attention_type="RCDA"): + super().__init__() + + self.d_model = d_model + self.nhead = nhead + + self.attention_type = attention_type + encoder_layer = TransformerEncoderLayerSpatial(d_model, dim_feedforward, + dropout, activation, nhead, attention_type) + encoder_layer_level = TransformerEncoderLayerLevel(d_model, dim_feedforward, + dropout, activation, nhead) + + decoder_layer = TransformerDecoderLayer(d_model, dim_feedforward, + dropout, activation, nhead, + num_feature_levels, attention_type) + + if num_feature_levels == 1: + self.num_encoder_layers_level = 0 + else: + self.num_encoder_layers_level = num_encoder_layers // 2 + self.num_encoder_layers_spatial = num_encoder_layers - self.num_encoder_layers_level + + self.encoder_layers = _get_clones( + encoder_layer, self.num_encoder_layers_spatial) + self.encoder_layers_level = _get_clones( + encoder_layer_level, self.num_encoder_layers_level) + self.decoder_layers = _get_clones(decoder_layer, num_decoder_layers) + + self.spatial_prior = spatial_prior + + if num_feature_levels > 1: + self.level_embed = nn.Embedding(num_feature_levels, d_model) + self.num_pattern = num_query_pattern + self.pattern = nn.Embedding(self.num_pattern, d_model) + + self.num_position = num_query_position + if self.spatial_prior == "learned": + self.position = nn.Embedding(self.num_position, 2) + + self.adapt_pos2d = nn.Sequential( + nn.Linear(d_model, d_model), + nn.ReLU(), + nn.Linear(d_model, d_model), + ) + self.adapt_pos1d = nn.Sequential( + nn.Linear(d_model, d_model), + nn.ReLU(), + nn.Linear(d_model, d_model), + ) + + self.num_layers = num_decoder_layers + self.num_classes = num_classes + + self.class_embed = nn.Linear(d_model, self.num_classes) + self.bbox_embed = MLP(d_model, d_model, 4, 3) + + self._reset_parameters() + + def _reset_parameters(self): + num_pred = self.num_layers + num_classes = self.num_classes + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(num_classes) * bias_value + + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + if self.spatial_prior == "learned": + nn.init.uniform_(self.position.weight.data, 0, 1) + + nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0) + self.class_embed = nn.ModuleList( + [self.class_embed for _ in range(num_pred)]) + self.bbox_embed = nn.ModuleList( + [self.bbox_embed for _ in range(num_pred)]) + + def forward(self, srcs, masks): + + # prepare input for decoder + bs, l, c, h, w = srcs.shape + + if self.spatial_prior == "learned": + reference_points = self.position.weight.unsqueeze( + 0).repeat(bs, self.num_pattern, 1) + elif self.spatial_prior == "grid": + nx = ny = round(math.sqrt(self.num_position)) + self.num_position = nx*ny + x = (torch.arange(nx) + 0.5) / nx + y = (torch.arange(ny) + 0.5) / ny + xy = torch.meshgrid(x, 
y) + reference_points = torch.cat( + [xy[0].reshape(-1)[..., None], xy[1].reshape(-1)[..., None]], -1).cuda() + reference_points = reference_points.unsqueeze( + 0).repeat(bs, self.num_pattern, 1) + else: + raise ValueError(f'unknown {self.spatial_prior} spatial prior') + + tgt = self.pattern.weight.reshape(1, self.num_pattern, 1, c).repeat(bs, 1, self.num_position, 1).reshape( + bs, self.num_pattern * self.num_position, c) + + mask = masks[-1].unsqueeze(1).repeat(1, l, 1, 1).reshape(bs*l, h, w) + pos_col, pos_row = mask2pos(mask) + if self.attention_type == "RCDA": + posemb_row = self.adapt_pos1d(pos2posemb1d(pos_row)) + posemb_col = self.adapt_pos1d(pos2posemb1d(pos_col)) + posemb_2d = None + else: + pos_2d = torch.cat([pos_row.unsqueeze(1).repeat( + 1, h, 1).unsqueeze(-1), pos_col.unsqueeze(2).repeat(1, 1, w).unsqueeze(-1)], dim=-1) + posemb_2d = self.adapt_pos2d(pos2posemb2d(pos_2d)) + posemb_row = posemb_col = None + + outputs = srcs.reshape(bs * l, c, h, w) + + for idx in range(len(self.encoder_layers)): + outputs = self.encoder_layers[idx]( + outputs, mask, posemb_row, posemb_col, posemb_2d) + if idx < self.num_encoder_layers_level: + outputs = self.encoder_layers_level[idx](outputs, level_emb=self.level_embed.weight.unsqueeze( + 1).unsqueeze(0).repeat(bs, 1, 1, 1).reshape(bs*l, 1, c)) + + srcs = outputs.reshape(bs, l, c, h, w) + + output = tgt + + outputs_classes = [] + outputs_coords = [] + for lid, layer in enumerate(self.decoder_layers): + output = layer(output, reference_points, srcs, mask, adapt_pos2d=self.adapt_pos2d, + adapt_pos1d=self.adapt_pos1d, posemb_row=posemb_row, posemb_col=posemb_col, posemb_2d=posemb_2d) + reference = inverse_sigmoid(reference_points) + outputs_class = self.class_embed[lid](output) + tmp = self.bbox_embed[lid](output) + if reference.shape[-1] == 4: + tmp += reference + else: + assert reference.shape[-1] == 2 + tmp[..., :2] += reference + outputs_coord = tmp.sigmoid() + outputs_classes.append(outputs_class[None, ]) + outputs_coords.append(outputs_coord[None, ]) + + output = torch.cat(outputs_classes, dim=0), torch.cat( + outputs_coords, dim=0) + + return output + + +class TransformerEncoderLayerSpatial(nn.Module): + def __init__(self, + d_model=256, d_ffn=1024, + dropout=0., activation="relu", + n_heads=8, attention_type="RCDA"): + super().__init__() + + self.attention_type = attention_type + if attention_type == "RCDA": + attention_module = MultiheadRCDA + elif attention_type == "nn.MultiheadAttention": + attention_module = nn.MultiheadAttention + else: + raise ValueError(f'unknown {attention_type} attention_type') + + # self attention + self.self_attn = attention_module(d_model, n_heads, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.ffn = FFN(d_model, d_ffn, dropout, activation) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward(self, src, padding_mask=None, posemb_row=None, posemb_col=None, posemb_2d=None): + # self attention + bz, c, h, w = src.shape + src = src.permute(0, 2, 3, 1) + + if self.attention_type == "RCDA": + posemb_row = posemb_row.unsqueeze(1).repeat(1, h, 1, 1) + posemb_col = posemb_col.unsqueeze(2).repeat(1, 1, w, 1) + src2 = self.self_attn((src + posemb_row).reshape(bz, h * w, c), (src + posemb_col).reshape(bz, h * w, c), + src + posemb_row, src + posemb_col, + src, key_padding_mask=padding_mask)[0].transpose(0, 1).reshape(bz, h, w, c) + else: + src2 = self.self_attn((src + posemb_2d).reshape(bz, h * 
w, c).transpose(0, 1), + (src + posemb_2d).reshape(bz, + h * w, c).transpose(0, 1), + src.reshape(bz, h * w, c).transpose(0, 1))[0].transpose(0, 1).reshape(bz, h, w, c) + + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.ffn(src) + src = src.permute(0, 3, 1, 2) + return src + + +class TransformerEncoderLayerLevel(nn.Module): + def __init__(self, + d_model=256, d_ffn=1024, + dropout=0., activation="relu", + n_heads=8): + super().__init__() + + # self attention + self.self_attn_level = nn.MultiheadAttention( + d_model, n_heads, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.ffn = FFN(d_model, d_ffn, dropout, activation) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward(self, src, level_emb=0): + # self attention + bz, c, h, w = src.shape + src = src.permute(0, 2, 3, 1) + + src2 = self.self_attn_level(src.reshape(bz, h * w, c) + level_emb, src.reshape(bz, h * w, c) + level_emb, + src.reshape(bz, h * w, c))[0].reshape(bz, h, w, c) + + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.ffn(src) + src = src.permute(0, 3, 1, 2) + return src + + +class TransformerDecoderLayer(nn.Module): + def __init__(self, d_model=256, d_ffn=1024, + dropout=0., activation="relu", n_heads=8, + n_levels=3, attention_type="RCDA"): + super().__init__() + + self.attention_type = attention_type + self.attention_type = attention_type + if attention_type == "RCDA": + attention_module = MultiheadRCDA + elif attention_type == "nn.MultiheadAttention": + attention_module = nn.MultiheadAttention + else: + raise ValueError(f'unknown {attention_type} attention_type') + + # cross attention + self.cross_attn = attention_module(d_model, n_heads, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention( + d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + # level combination + if n_levels > 1: + self.level_fc = nn.Linear(d_model * n_levels, d_model) + + # ffn + self.ffn = FFN(d_model, d_ffn, dropout, activation) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward(self, tgt, reference_points, srcs, src_padding_masks=None, adapt_pos2d=None, + adapt_pos1d=None, posemb_row=None, posemb_col=None, posemb_2d=None): + tgt_len = tgt.shape[1] + + query_pos = pos2posemb2d(reference_points.squeeze(2)) + query_pos = adapt_pos2d(query_pos) + # self attention + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn(q.transpose(0, 1), k.transpose( + 0, 1), tgt.transpose(0, 1))[0].transpose(0, 1) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + bz, l, c, h, w = srcs.shape + srcs = srcs.reshape(bz * l, c, h, w).permute(0, 2, 3, 1) + + if self.attention_type == "RCDA": + query_pos_x = adapt_pos1d(pos2posemb1d(reference_points[..., 0])) + query_pos_y = adapt_pos1d(pos2posemb1d(reference_points[..., 1])) + posemb_row = posemb_row.unsqueeze(1).repeat(1, h, 1, 1) + posemb_col = posemb_col.unsqueeze(2).repeat(1, 1, w, 1) + src_row = src_col = srcs + k_row = src_row + posemb_row + k_col = src_col + posemb_col + tgt2 = self.cross_attn((tgt + query_pos_x).repeat(l, 1, 1), (tgt + query_pos_y).repeat(l, 1, 1), k_row, k_col, + srcs, key_padding_mask=src_padding_masks)[0].transpose(0, 1) + else: + tgt2 = self.cross_attn((tgt + 
query_pos).repeat(l, 1, 1).transpose(0, 1), + (srcs + posemb_2d).reshape(bz * + l, h * w, c).transpose(0, 1), + srcs.reshape(bz * l, h * w, c).transpose(0, 1))[0].transpose(0, 1) + + if l > 1: + tgt2 = self.level_fc(tgt2.reshape(bz, l, tgt_len, c).permute( + 0, 2, 3, 1).reshape(bz, tgt_len, c * l)) + + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt = self.ffn(tgt) + + return tgt + + +class FFN(nn.Module): + + def __init__(self, d_model=256, d_ffn=1024, dropout=0., activation='relu'): + super().__init__() + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + def forward(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + +class MLP(nn.Module): + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) + for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") + + +def build_transformer(args): + return Transformer( + d_model=args.hidden_dim, + nhead=args.nheads, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + dim_feedforward=args.dim_feedforward, + dropout=args.dropout, + activation="relu", + num_feature_levels=args.num_feature_levels, + num_query_position=args.num_query_position, + num_query_pattern=args.num_query_pattern, + spatial_prior=args.spatial_prior, + attention_type=args.attention_type, + ) + + +def pos2posemb2d(pos, num_pos_feats=128, temperature=10000): + scale = 2 * math.pi + pos = pos * scale + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + pos_x = pos[..., 0, None] / dim_t + pos_y = pos[..., 1, None] / dim_t + pos_x = torch.stack( + (pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) + pos_y = torch.stack( + (pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) + posemb = torch.cat((pos_y, pos_x), dim=-1) + return posemb + + +def pos2posemb1d(pos, num_pos_feats=256, temperature=10000): + scale = 2 * math.pi + pos = pos * scale + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + pos_x = pos[..., None] / dim_t + posemb = torch.stack( + (pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) + return posemb + + +def mask2pos(mask): + not_mask = ~mask + y_embed = not_mask[:, :, 0].cumsum(1, dtype=torch.float32) + x_embed = not_mask[:, 0, :].cumsum(1, dtype=torch.float32) + y_embed = (y_embed - 0.5) / y_embed[:, -1:] + x_embed = (x_embed - 0.5) / x_embed[:, -1:] + return y_embed, x_embed diff --git a/yolov7/modeling/backbone/cfg.py 
b/yolov7/modeling/backbone/cfg.py new file mode 100644 index 0000000..183dada --- /dev/null +++ b/yolov7/modeling/backbone/cfg.py @@ -0,0 +1,46 @@ +from detectron2.config import CfgNode as CN + + +def add_fbnet_v2_default_configs(_C): + _C.MODEL.FBNET_V2 = CN() + + _C.MODEL.FBNET_V2.ARCH = "default" + _C.MODEL.FBNET_V2.ARCH_DEF = [] + # number of channels input to trunk + _C.MODEL.FBNET_V2.STEM_IN_CHANNELS = 3 + _C.MODEL.FBNET_V2.SCALE_FACTOR = 1.0 + # the output channels will be divisible by WIDTH_DIVISOR + _C.MODEL.FBNET_V2.WIDTH_DIVISOR = 1 + + # normalization configs + # name of norm such as "bn", "sync_bn", "gn" + _C.MODEL.FBNET_V2.NORM = "bn" + # for advanced use case that requries extra arguments, passing a list of + # dict such as [{"num_groups": 8}, {"momentum": 0.1}] (merged in given order). + # Note that string written it in .yaml will be evaluated by yacs, thus this + # node will become normal python object. + # https://github.com/rbgirshick/yacs/blob/master/yacs/config.py#L410 + _C.MODEL.FBNET_V2.NORM_ARGS = [] + + _C.MODEL.VT_FPN = CN() + + _C.MODEL.VT_FPN.IN_FEATURES = ["res2", "res3", "res4", "res5"] + _C.MODEL.VT_FPN.OUT_CHANNELS = 256 + _C.MODEL.VT_FPN.LAYERS = 3 + _C.MODEL.VT_FPN.TOKEN_LS = [16, 16, 8, 8] + _C.MODEL.VT_FPN.TOKEN_C = 1024 + _C.MODEL.VT_FPN.HEADS = 16 + _C.MODEL.VT_FPN.MIN_GROUP_PLANES = 64 + _C.MODEL.VT_FPN.NORM = "BN" + _C.MODEL.VT_FPN.POS_HWS = [] + _C.MODEL.VT_FPN.POS_N_DOWNSAMPLE = [] + + + +def add_convnext_default_configs(_C): + _C.MODEL.CONVNEXT = CN() + + _C.MODEL.CONVNEXT.OUT_FEATURES = ["dark3", "dark4", "dark5"] + _C.MODEL.CONVNEXT.WEIGHTS = "" + _C.MODEL.CONVNEXT.DEPTH_WISE = False + diff --git a/yolov7/modeling/backbone/convnext.py b/yolov7/modeling/backbone/convnext.py new file mode 100644 index 0000000..a557cc7 --- /dev/null +++ b/yolov7/modeling/backbone/convnext.py @@ -0,0 +1,230 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. + +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + + +from functools import partial +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import trunc_normal_, DropPath + +# from mmcv_custom import load_checkpoint +# from mmcv.runner +# from mmdet.utils import get_root_logger + +from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY +from torch.utils.model_zoo import load_url as load_state_dict_from_url +from alfred import logger +from detectron2.layers import ShapeSpec + + +class Block(nn.Module): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. 
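Since the block described above is shape-preserving (depthwise 7x7 conv, LayerNorm, two pointwise linears, residual add), a quick sanity check looks like the following; a sketch for illustration only, assuming timm is installed for DropPath:

import torch
from yolov7.modeling.backbone.convnext import Block

block = Block(dim=96, drop_path=0.0, layer_scale_init_value=1e-6)
x = torch.randn(2, 96, 56, 56)    # (N, C, H, W)
assert block(x).shape == x.shape  # the ConvNeXt block keeps the (N, C, H, W) layout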
+ """ + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv + self.norm = LayerNorm(dim, eps=1e-6) + self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim)), + requires_grad=True) if layer_scale_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + +class ConvNeXt(Backbone): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. + """ + def __init__(self, in_chans=3, depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], + drop_path_rate=0., layer_scale_init_value=1e-6, out_indices=[0, 1, 2, 3], + ): + super().__init__() + self.output_shape_dict = dict() + + self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers + stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4), + LayerNorm(dims[0], eps=1e-6, data_format="channels_first") + ) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + LayerNorm(dims[i], eps=1e-6, data_format="channels_first"), + nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2), + ) + self.downsample_layers.append(downsample_layer) + + self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks + dp_rates=[x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] + cur = 0 + for i in range(4): + stage = nn.Sequential( + *[Block(dim=dims[i], drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) for j in range(depths[i])] + ) + self.stages.append(stage) + cur += depths[i] + + self.out_indices = out_indices + + norm_layer = partial(LayerNorm, eps=1e-6, data_format="channels_first") + for i_layer in range(4): + layer = norm_layer(dims[i_layer]) + layer_name = f'norm{i_layer}' + self.add_module(layer_name, layer) + + self.output_shape_dict[i_layer] = ShapeSpec(channels=dims[i_layer]) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + trunc_normal_(m.weight, std=.02) + nn.init.constant_(m.bias, 0) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + if isinstance(pretrained, str): + self.apply(_init_weights) + logger.info(f'passing loading weights: {_init_weights}') + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + self.apply(_init_weights) + else: + raise TypeError('pretrained must be a str or None') + + def forward_features(self, x): + outs = [] + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + x_out = norm_layer(x) + outs.append(x_out) + + return tuple(outs) + + def output_shape(self): + # self.output_shape_dict["res5"] = ShapeSpec( + # channels=1024, stride=16 if self.res5_dilation == 2 else 32 + # ) + return self.output_shape_dict + + def forward(self, x): + x = self.forward_features(x) + return x + + @property + def size_divisibility(self) -> int: + """ + Some backbones require the input height and width to be divisible by a + specific integer. This is typically true for encoder / decoder type networks + with lateral connection (e.g., FPN) for which feature maps need to match + dimension in the "bottom up" and "top down" paths. Set to 0 if no specific + input size divisibility is required. + """ + return 32 + +class LayerNorm(nn.Module): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). 
+ """ + def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.data_format = data_format + if self.data_format not in ["channels_last", "channels_first"]: + raise NotImplementedError + self.normalized_shape = (normalized_shape, ) + + def forward(self, x): + if self.data_format == "channels_last": + return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + elif self.data_format == "channels_first": + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +@BACKBONE_REGISTRY.register() +def build_convnext_backbone(cfg, input_shape=None): + + depth = cfg.MODEL.DARKNET.DEPTH + out_fea_indices = cfg.MODEL.CONVNEXT.OUT_FEATURES + if len(out_fea_indices) == 3: + out_indices = [0, 1, 2] + elif len(out_fea_indices) == 4: + out_indices = [0, 1, 2, 3] + else: + out_indices = [0, 1, 2, 3] + + # hack here + + return ConvNeXt( + in_chans=3, + depths=[3, 3, 9, 3], + dims=[96, 192, 384, 768], + drop_path_rate=0.2, + layer_scale_init_value=1e-6, + out_indices=out_indices, + ) \ No newline at end of file diff --git a/yolov7/modeling/backbone/cspdarknet.py b/yolov7/modeling/backbone/cspdarknet.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/cspresnet.py b/yolov7/modeling/backbone/cspresnet.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/dabdetr_backbone.py b/yolov7/modeling/backbone/dabdetr_backbone.py new file mode 100644 index 0000000..e69de29 diff --git a/yolov7/modeling/backbone/darknet.py b/yolov7/modeling/backbone/darknet.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/darknetx.py b/yolov7/modeling/backbone/darknetx.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/detr_backbone.py b/yolov7/modeling/backbone/detr_backbone.py old mode 100755 new mode 100644 index 045dc30..df59c4b --- a/yolov7/modeling/backbone/detr_backbone.py +++ b/yolov7/modeling/backbone/detr_backbone.py @@ -33,7 +33,8 @@ def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation, normalize_before) encoder_norm = nn.LayerNorm(d_model) if normalize_before else None - self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + self.encoder = TransformerEncoder( + encoder_layer, num_encoder_layers, encoder_norm) decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation, normalize_before) @@ -197,7 +198,8 @@ def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False): super().__init__() self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention( + d_model, nhead, dropout=dropout) # Implementation of Feedforward model self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) @@ -276,7 +278,6 @@ def forward(self, tgt, memory, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) - def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) @@ -310,7 +311,15 @@ class 
PositionEmbeddingSine(nn.Module): This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on images. """ - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + + def __init__( + self, + num_pos_feats=64, + temperature=10000, + normalize=False, + scale=None, + centered=False, + ): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature @@ -320,27 +329,49 @@ def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=N if scale is None: scale = 2 * math.pi self.scale = scale + self.centered = centered def forward(self, tensor_list: NestedTensor): + # x shape (B, C, H, W) x = tensor_list.tensors + # mask shape (B, H, W) mask = tensor_list.mask assert mask is not None not_mask = ~mask - y_embed = not_mask.cumsum(1, dtype=torch.float32) + y_embed = not_mask.cumsum(1, dtype=torch.float32) # shape (B, H, W) x_embed = not_mask.cumsum(2, dtype=torch.float32) if self.normalize: eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale - - dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) - dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) - - pos_x = x_embed[:, :, :, None] / dim_t + if self.centered: + y_embed = (y_embed - 0.5) / \ + (y_embed[:, -1:, :] + eps) * self.scale + x_embed = (x_embed - 0.5) / \ + (x_embed[:, :, -1:] + eps) * self.scale + else: + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, + dtype=torch.float32, device=x.device) + dim_t = self.temperature ** ( + 2 * (dim_t // 2) / self.num_pos_feats + ) # shape (N, ) + + pos_x = x_embed[:, :, :, None] / dim_t # shape (B, H, W, N) pos_y = y_embed[:, :, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten( + 3 + ) # shape (B, H, W, N) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten( + 3 + ) # shape (B, H, W, N) + pos = torch.cat((pos_y, pos_x), dim=3).permute( + 0, 3, 1, 2 + ) # shape (B, 2*N, H, W) return pos @@ -348,6 +379,7 @@ class PositionEmbeddingLearned(nn.Module): """ Absolute pos embedding, learned. """ + def __init__(self, num_pos_feats=256): super().__init__() self.row_embed = nn.Embedding(50, num_pos_feats) @@ -365,12 +397,21 @@ def forward(self, tensor_list: NestedTensor): j = torch.arange(h, device=x.device) x_emb = self.col_embed(i) y_emb = self.row_embed(j) - pos = torch.cat([ - x_emb.unsqueeze(0).repeat(h, 1, 1), - y_emb.unsqueeze(1).repeat(1, w, 1), - ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) + pos = ( + torch.cat( + [ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], + dim=-1, + ) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(x.shape[0], 1, 1, 1) + ) return pos + class FrozenBatchNorm2d(torch.nn.Module): """ BatchNorm2d where the batch statistics and the affine parameters are fixed. 
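[Reviewer note: illustrative sketch, not part of the patch] The new centered flag shifts the cumulative-sum coordinate by 0.5 before normalizing, so each pixel is embedded at its cell center; with centered=False the previous formula is kept, so the default behavior is unchanged. This matches the mask2pos / pos2posemb2d helpers added in the transformer file above. A minimal standalone sketch of the same sin/cos encoding (function and variable names here are mine, not from the patch):

import math
import torch

def sine_embed_1d(coord, num_pos_feats=128, temperature=10000):
    """coord: normalized positions in [0, 1]; returns (..., num_pos_feats)."""
    coord = coord * 2 * math.pi
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=coord.device)
    dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats)
    pos = coord[..., None] / dim_t
    # interleave sin on even channels and cos on odd channels, as in pos2posemb1d/2d
    return torch.stack((pos[..., 0::2].sin(), pos[..., 1::2].cos()), dim=-1).flatten(-2)

h, w = 4, 6
# cell-center coordinates in [0, 1], i.e. the "centered" convention
ys = (torch.arange(h, dtype=torch.float32) + 0.5) / h
xs = (torch.arange(w, dtype=torch.float32) + 0.5) / w
yy, xx = torch.meshgrid(ys, xs, indexing="ij")
posemb = torch.cat((sine_embed_1d(yy), sine_embed_1d(xx)), dim=-1)  # (h, w, 256), (y, x) order
print(posemb.shape)  # torch.Size([4, 6, 256])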
@@ -418,10 +459,12 @@ def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: parameter.requires_grad_(False) if return_interm_layers: - return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} + return_layers = {"layer1": "0", "layer2": "1", + "layer3": "2", "layer4": "3"} else: return_layers = {'layer4': "0"} - self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) + self.body = IntermediateLayerGetter( + backbone, return_layers=return_layers) self.num_channels = num_channels def forward(self, tensor_list: NestedTensor): @@ -430,13 +473,15 @@ def forward(self, tensor_list: NestedTensor): for name, x in xs.items(): m = tensor_list.mask assert m is not None - mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] + mask = F.interpolate( + m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] out[name] = NestedTensor(x, mask) return out class Backbone(BackboneBase): """ResNet backbone with frozen BatchNorm.""" + def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, @@ -451,7 +496,7 @@ def __init__(self, name: str, class Joiner(nn.Sequential): def __init__(self, backbone, position_embedding): super().__init__(backbone, position_embedding) - + def prepare_onnx_export(self): self[0].onnx_export = True @@ -467,11 +512,29 @@ def forward(self, tensor_list: NestedTensor): return out, pos +class MLP(nn.Module): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + def build_backbone(args): position_embedding = build_position_encoding(args) train_backbone = args.lr_backbone > 0 return_interm_layers = args.masks - backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) + backbone = Backbone(args.backbone, train_backbone, + return_interm_layers, args.dilation) model = Joiner(backbone, position_embedding) model.num_channels = backbone.num_channels return model diff --git a/yolov7/modeling/backbone/dla.py b/yolov7/modeling/backbone/dla.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/dlafpn.py b/yolov7/modeling/backbone/dlafpn.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/efficientnet.py b/yolov7/modeling/backbone/efficientnet.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/efficientrep.py b/yolov7/modeling/backbone/efficientrep.py new file mode 100644 index 0000000..ff5da0b --- /dev/null +++ b/yolov7/modeling/backbone/efficientrep.py @@ -0,0 +1,538 @@ +from torch import nn +import warnings +from pathlib import Path + +import numpy as np +import torch +import torch.nn as nn + +from yolov7.utils.checkpoint import load_checkpoint +from detectron2.layers import ShapeSpec +from detectron2.modeling import ( + BACKBONE_REGISTRY, + RPN_HEAD_REGISTRY, + Backbone, +) +from yolov7.utils.misc import make_divisible + + +class SiLU(nn.Module): + """Activation of SiLU""" + + @staticmethod + def forward(x): + return x * torch.sigmoid(x) + + +class Conv(nn.Module): + """Normal Conv with SiLU 
activation""" + + def __init__( + self, in_channels, out_channels, kernel_size, stride, groups=1, bias=False + ): + super().__init__() + padding = kernel_size // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = nn.SiLU() + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def forward_fuse(self, x): + return self.act(self.conv(x)) + + +class SimConv(nn.Module): + """Normal Conv with ReLU activation""" + + def __init__( + self, in_channels, out_channels, kernel_size, stride, groups=1, bias=False + ): + super().__init__() + padding = kernel_size // 2 + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=bias, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.act = nn.ReLU() + + def forward(self, x): + return self.act(self.bn(self.conv(x))) + + def forward_fuse(self, x): + return self.act(self.conv(x)) + + +class SimSPPF(nn.Module): + """Simplified SPPF with ReLU activation""" + + def __init__(self, in_channels, out_channels, kernel_size=5): + super().__init__() + c_ = in_channels // 2 # hidden channels + self.cv1 = SimConv(in_channels, c_, 1, 1) + self.cv2 = SimConv(c_ * 4, out_channels, 1, 1) + self.m = nn.MaxPool2d( + kernel_size=kernel_size, stride=1, padding=kernel_size // 2 + ) + + def forward(self, x): + x = self.cv1(x) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + y1 = self.m(x) + y2 = self.m(y1) + return self.cv2(torch.cat([x, y1, y2, self.m(y2)], 1)) + + +class Transpose(nn.Module): + """Normal Transpose, default for upsampling""" + + def __init__(self, in_channels, out_channels, kernel_size=2, stride=2): + super().__init__() + self.upsample_transpose = torch.nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + bias=True, + ) + + def forward(self, x): + return self.upsample_transpose(x) + + +class Concat(nn.Module): + def __init__(self, dimension=1): + super().__init__() + self.d = dimension + + def forward(self, x): + return torch.cat(x, self.d) + + +def conv_bn(in_channels, out_channels, kernel_size, stride, padding, groups=1): + """Basic cell for rep-style block, including conv and bn""" + result = nn.Sequential() + result.add_module( + "conv", + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ) + result.add_module("bn", nn.BatchNorm2d(num_features=out_channels)) + return result + + +class RepBlock(nn.Module): + """ + RepBlock is a stage block with rep-style basic block + """ + + def __init__(self, in_channels, out_channels, n=1): + super().__init__() + self.conv1 = RepVGGBlock(in_channels, out_channels) + self.block = ( + nn.Sequential( + *(RepVGGBlock(out_channels, out_channels) for _ in range(n - 1)) + ) + if n > 1 + else None + ) + + def forward(self, x): + x = self.conv1(x) + if self.block is not None: + x = self.block(x) + return x + + +class RepVGGBlock(nn.Module): + """RepVGGBlock is a basic rep-style block, including training and deploy status + This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + padding_mode="zeros", + deploy=False, + 
use_se=False, + ): + super(RepVGGBlock, self).__init__() + """ Intialization of the class. + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 1 + dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + padding_mode (string, optional): Default: 'zeros' + deploy: Whether to be deploy status or training status. Default: False + use_se: Whether to use se. Default: False + """ + self.deploy = deploy + self.groups = groups + self.in_channels = in_channels + self.out_channels = out_channels + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = nn.ReLU() + + if use_se: + raise NotImplementedError("se block not supported yet") + else: + self.se = nn.Identity() + + if deploy: + self.rbr_reparam = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + padding_mode=padding_mode, + ) + + else: + self.rbr_identity = ( + nn.BatchNorm2d(num_features=in_channels) + if out_channels == in_channels and stride == 1 + else None + ) + self.rbr_dense = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + ) + self.rbr_1x1 = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups, + ) + + def forward(self, inputs): + """Forward process""" + if hasattr(self, "rbr_reparam"): + return self.nonlinearity(self.se(self.rbr_reparam(inputs))) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + + return self.nonlinearity( + self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out) + ) + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return ( + kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, + bias3x3 + bias1x1 + biasid, + ) + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, nn.Sequential): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + else: + assert isinstance(branch, nn.BatchNorm2d) + if not hasattr(self, "id_tensor"): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros( + (self.in_channels, input_dim, 3, 3), dtype=np.float32 + ) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = 
(running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def switch_to_deploy(self): + if hasattr(self, "rbr_reparam"): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam = nn.Conv2d( + in_channels=self.rbr_dense.conv.in_channels, + out_channels=self.rbr_dense.conv.out_channels, + kernel_size=self.rbr_dense.conv.kernel_size, + stride=self.rbr_dense.conv.stride, + padding=self.rbr_dense.conv.padding, + dilation=self.rbr_dense.conv.dilation, + groups=self.rbr_dense.conv.groups, + bias=True, + ) + self.rbr_reparam.weight.data = kernel + self.rbr_reparam.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__("rbr_dense") + self.__delattr__("rbr_1x1") + if hasattr(self, "rbr_identity"): + self.__delattr__("rbr_identity") + if hasattr(self, "id_tensor"): + self.__delattr__("id_tensor") + self.deploy = True + + +class DetectBackend(nn.Module): + def __init__(self, weights="yolov6s.pt", device=None, dnn=True): + + super().__init__() + assert ( + isinstance(weights, str) and Path(weights).suffix == ".pt" + ), f"{Path(weights).suffix} format is not supported." + + model = load_checkpoint(weights, map_location=device) + stride = int(model.stride.max()) + self.__dict__.update(locals()) # assign all variables to self + + def forward(self, im, val=False): + y = self.model(im) + if isinstance(y, np.ndarray): + y = torch.tensor(y, device=self.device) + return y + + +class EfficientRep(Backbone): + """EfficientRep Backbone + EfficientRep is handcrafted by hardware-aware neural network design. + With rep-style struct, EfficientRep is friendly to high-computation hardware(e.g. GPU). + """ + + def __init__( + self, + in_channels=3, + channels_list=None, + num_repeats=None, + out_features=None, + ): + super().__init__() + + assert channels_list is not None + assert num_repeats is not None + + self.channels_list = channels_list + self.num_repeats = num_repeats + + self.stem = RepVGGBlock( + in_channels=in_channels, + out_channels=channels_list[0], + kernel_size=3, + stride=2, + ) + + self.ERBlock_2 = nn.Sequential( + RepVGGBlock( + in_channels=channels_list[0], + out_channels=channels_list[1], + kernel_size=3, + stride=2, + ), + RepBlock( + in_channels=channels_list[1], + out_channels=channels_list[1], + n=num_repeats[1], + ), + ) + + self.ERBlock_3 = nn.Sequential( + RepVGGBlock( + in_channels=channels_list[1], + out_channels=channels_list[2], + kernel_size=3, + stride=2, + ), + RepBlock( + in_channels=channels_list[2], + out_channels=channels_list[2], + n=num_repeats[2], + ), + ) + + self.ERBlock_4 = nn.Sequential( + RepVGGBlock( + in_channels=channels_list[2], + out_channels=channels_list[3], + kernel_size=3, + stride=2, + ), + RepBlock( + in_channels=channels_list[3], + out_channels=channels_list[3], + n=num_repeats[3], + ), + ) + + self.ERBlock_5 = nn.Sequential( + RepVGGBlock( + in_channels=channels_list[3], + out_channels=channels_list[4], + kernel_size=3, + stride=2, + ), + RepBlock( + in_channels=channels_list[4], + out_channels=channels_list[4], + n=num_repeats[4], + ), + SimSPPF( + in_channels=channels_list[4], + out_channels=channels_list[4], + kernel_size=5, + ), + ) + + # 64, 128, 256, 512, 1024 + self._out_feature_strides = { + "stride4": 4, + "stride8": 8, + "stride16": 16, + "stride32": 32, + } + self._out_feature_channels = { + k: c + for k, c in zip( + self._out_feature_strides.keys(), + [ + channels_list[1], + channels_list[2], + channels_list[3], + 
channels_list[4], + ], + ) + } + self.out_features = out_features + + def output_shape(self): + return { + f"stride{s}": ShapeSpec(channels=self._out_feature_channels[k], stride=s) + for k, s in self._out_feature_strides.items() + } + + def forward(self, x): + outputs = {} + x = self.stem(x) + x = self.ERBlock_2(x) + x = self.ERBlock_3(x) + outputs["stride8"] = x + x = self.ERBlock_4(x) + outputs["stride16"] = x + x = self.ERBlock_5(x) + outputs["stride32"] = x + return outputs + + +@BACKBONE_REGISTRY.register() +def build_efficientrep_backbone(cfg, input_shape): + _out_features = cfg.MODEL.BACKBONE.OUT_FEATURES + depth_mul = cfg.MODEL.YOLO.DEPTH_MUL + width_mul = cfg.MODEL.YOLO.WIDTH_MUL + + channels_list_backbone = [64, 128, 256, 512, 1024] + channels_list_neck = [256, 128, 128, 256, 256, 512] + num_repeat_backbone = [1, 6, 12, 18, 6] + num_repeat_neck = [12, 12, 12, 12] + + num_repeat = [ + (max(round(i * depth_mul), 1) if i > 1 else i) + for i in (num_repeat_backbone + num_repeat_neck) + ] + channels_list = [ + make_divisible(i * width_mul, 8) + for i in (channels_list_backbone + channels_list_neck) + ] + + # currently only support 3 outputs fixed + backbone = EfficientRep(channels_list=channels_list, num_repeats=num_repeat) + return backbone + + +@BACKBONE_REGISTRY.register() +def build_efficientrep_tiny_backbone(cfg, input_shape): + _out_features = cfg.MODEL.BACKBONE.OUT_FEATURES + depth_mul = cfg.MODEL.YOLO.DEPTH_MUL + width_mul = cfg.MODEL.YOLO.WIDTH_MUL + + channels_list_backbone = [64, 128, 256, 512, 1024] + channels_list_neck = [256, 128, 128, 256, 256, 512] + num_repeat_backbone = [1, 6, 12, 18, 6] + num_repeat_neck = [12, 12, 12, 12] + + num_repeat = [ + (max(round(i * depth_mul), 1) if i > 1 else i) + for i in (num_repeat_backbone + num_repeat_neck) + ] + channels_list = [ + make_divisible(i * width_mul, 8) + for i in (channels_list_backbone + channels_list_neck) + ] + + # currently only support 3 outputs fixed + backbone = EfficientRep(channels_list=channels_list, num_repeats=num_repeat) + return backbone \ No newline at end of file diff --git a/yolov7/modeling/backbone/fbnet_v2.py b/yolov7/modeling/backbone/fbnet_v2.py new file mode 100644 index 0000000..b2e69db --- /dev/null +++ b/yolov7/modeling/backbone/fbnet_v2.py @@ -0,0 +1,530 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + + +import copy +import itertools +import logging +from typing import List +import copy + +import torch +import torch.nn as nn +from detectron2.layers import ShapeSpec +from detectron2.modeling import ( + BACKBONE_REGISTRY, + RPN_HEAD_REGISTRY, + Backbone, + build_anchor_generator, +) +from detectron2.modeling.backbone.fpn import FPN, LastLevelMaxPool, LastLevelP6P7 +from detectron2.modeling.roi_heads import box_head, keypoint_head, mask_head +from detectron2.utils.logger import log_first_n +try: + from mobile_cv.arch.fbnet_v2 import fbnet_builder as mbuilder + from mobile_cv.arch.utils.helper import format_dict_expanding_list_values +except Exception as e: + pass + + +logger = logging.getLogger(__name__) + +FBNET_BUILDER_IDENTIFIER = "fbnetv2" + + +class FBNetV2ModelArch(object): + _MODEL_ARCH = {} + + @staticmethod + def add(name, arch): + assert ( + name not in FBNetV2ModelArch._MODEL_ARCH + ), "Arch name '{}' is already existed".format(name) + FBNetV2ModelArch._MODEL_ARCH[name] = arch + + @staticmethod + def add_archs(archs): + for name, arch in archs.items(): + FBNetV2ModelArch.add(name, arch) + + @staticmethod + def get(name): + return copy.deepcopy(FBNetV2ModelArch._MODEL_ARCH[name]) + + +def _get_builder_norm_args(cfg): + norm_name = cfg.MODEL.FBNET_V2.NORM + norm_args = {"name": norm_name} + assert all(isinstance(x, dict) for x in cfg.MODEL.FBNET_V2.NORM_ARGS) + for dic in cfg.MODEL.FBNET_V2.NORM_ARGS: + norm_args.update(dic) + return norm_args + + +def _merge_fbnetv2_arch_def(cfg): + arch_def = {} + assert all( + isinstance(x, dict) for x in cfg.MODEL.FBNET_V2.ARCH_DEF + ), cfg.MODEL.FBNET_V2.ARCH_DEF + for dic in cfg.MODEL.FBNET_V2.ARCH_DEF: + arch_def.update(dic) + return arch_def + + +def _parse_arch_def(cfg): + arch = cfg.MODEL.FBNET_V2.ARCH + arch_def = cfg.MODEL.FBNET_V2.ARCH_DEF + assert (arch != "" and not arch_def) ^ ( + not arch and arch_def != [] + ), "Only allow one unset node between MODEL.FBNET_V2.ARCH ({}) and MODEL.FBNET_V2.ARCH_DEF ({})".format( + arch, arch_def + ) + arch_def = FBNetV2ModelArch.get( + arch) if arch else _merge_fbnetv2_arch_def(cfg) + # NOTE: arch_def is a dictionary describing the CNN architecture for creating + # the detection model. It can describe a wide range of models including the + # original FBNet. Each key-value pair expresses either a sub part of the model + # like trunk or head, or stores other meta information. + message = 'Using un-unified arch_def for ARCH "{}" (without scaling):\n{}'.format( + arch, format_dict_expanding_list_values(arch_def) + ) + log_first_n(logging.INFO, message, n=1, key="message") + return arch_def + + +def _get_fbnet_builder_and_arch_def(cfg): + arch_def = _parse_arch_def(cfg) + + # NOTE: one can store extra information in arch_def to configurate FBNetBuilder, + # after this point, builder and arch_def will become independent. + basic_args = arch_def.pop("basic_args", {}) + + builder = mbuilder.FBNetBuilder( + width_ratio=cfg.MODEL.FBNET_V2.SCALE_FACTOR, + width_divisor=cfg.MODEL.FBNET_V2.WIDTH_DIVISOR, + bn_args=_get_builder_norm_args(cfg), + ) + builder.add_basic_args(**basic_args) + + return builder, arch_def + + +def _get_stride_per_stage(blocks): + """ + Count the accummulated stride per stage given a list of blocks. The mbuilder + provides API for counting per-block accumulated stride, this function leverages + it to count per-stage accumulated stride. + + Input: a list of blocks from the unified arch_def. 
Note that the stage_idx + must be contiguous (not necessarily starting from 0), and can be + non-ascending (not tested). + Output: a list of accumulated stride per stage, starting from lowest stage_idx. + """ + stride_per_block = mbuilder.count_stride_each_block(blocks) + + assert len(stride_per_block) == len(blocks) + stage_idx_set = {s["stage_idx"] for s in blocks} + # assume stage idx are contiguous, eg. 1, 2, 3, ... + assert max(stage_idx_set) - min(stage_idx_set) + 1 == len(stage_idx_set) + start_stage_id = min(stage_idx_set) + ids_per_stage = [ + [i for i, s in enumerate(blocks) if s["stage_idx"] == stage_idx] + for stage_idx in range(start_stage_id, start_stage_id + len(stage_idx_set)) + ] # eg. [[0], [1, 2], [3, 4, 5, 6], ...] + block_stride_per_stage = [ + [stride_per_block[i] for i in ids] for ids in ids_per_stage + ] # eg. [[1], [2, 1], [2, 1, 1, 1], ...] + stride_per_stage = [ + list(itertools.accumulate(s, lambda x, y: x * y))[-1] + for s in block_stride_per_stage + ] # eg. [1, 2, 2, ...] + accum_stride_per_stage = list( + itertools.accumulate(stride_per_stage, lambda x, y: x * y) + ) # eg. [first*1, first*2, first*4, ...] + + assert accum_stride_per_stage[-1] == mbuilder.count_strides(blocks) + return accum_stride_per_stage + + +def fbnet_identifier_checker(func): + """Can be used to decorate _load_from_state_dict""" + + def wrapper(self, state_dict, prefix, *args, **kwargs): + possible_keys = [k for k in state_dict.keys() if k.startswith(prefix)] + if not all(FBNET_BUILDER_IDENTIFIER in k for k in possible_keys): + logger.warning( + "Couldn't match FBNetV2 pattern given prefix {}, possible keys: \n{}".format( + prefix, "\n".join(possible_keys) + ) + ) + if any("xif" in k for k in possible_keys): + raise RuntimeError( + "Seems a FBNetV1 trained checkpoint is loaded by FBNetV2 model," + " which is not supported. Please consider re-train your model" + " using the same setup as before (it will be FBNetV2). If you" + " need to run the old FBNetV1 models, those configs can be" + " still found, see D19477651 as example." + ) + return func(self, state_dict, prefix, *args, **kwargs) + + return wrapper + + +# pyre-fixme[11]: Annotation `Sequential` is not defined as a type. +class FBNetModule(nn.Sequential): + @fbnet_identifier_checker + def _load_from_state_dict(self, *args, **kwargs): + return super()._load_from_state_dict(*args, **kwargs) + + +def build_fbnet(cfg, name, in_channels): + """ + Create a FBNet module using FBNet V2 builder. + Args: + cfg (CfgNode): the config that contains MODEL.FBNET_V2. + name (str): the key in arch_def that represents a subpart of network + in_channels (int): input channel size + Returns: + nn.Sequential: the first return is a nn.Sequential, each element + corresponds a stage in arch_def. + List[ShapeSpec]: the second return is a list of ShapeSpec containing the + output channels and accumulated strides for that stage. + """ + builder, raw_arch_def = _get_fbnet_builder_and_arch_def(cfg) + # Reset the last_depth for this builder (might have been cached), this is + # the only mutable member variable. + builder.last_depth = in_channels + + # NOTE: Each sub part of the model consists of several stages and each stage + # has several blocks. "Raw" arch_def (Dict[str, List[List[Tuple]]]) uses a + # list of stages to describe the architecture, which is more compact and + # thus written as builtin metadata (inside FBNetV2ModelArch) or config + # (MODEL.FBNET_V2.ARCH_DEF). 
"Unified" arch_def (Dict[str, List[Dict]]) + # uses a list blocks from all stages instead, which is recognized by builder. + arch_def = mbuilder.unify_arch_def(raw_arch_def, [name]) + arch_def = {name: arch_def[name]} + logger.info( + "Build FBNet using unified arch_def:\n{}".format( + format_dict_expanding_list_values(arch_def) + ) + ) + arch_def_blocks = arch_def[name] + + stages = [] + trunk_stride_per_stage = _get_stride_per_stage(arch_def_blocks) + shape_spec_per_stage = [] + for i, stride_i in enumerate(trunk_stride_per_stage): + stages.append( + builder.build_blocks( + arch_def_blocks, + stage_indices=[i], + prefix_name=FBNET_BUILDER_IDENTIFIER + "_", + ) + ) + shape_spec_per_stage.append( + ShapeSpec( + channels=builder.last_depth, + stride=stride_i, + ) + ) + return FBNetModule(*stages), shape_spec_per_stage + + +class FBNetV2Backbone(Backbone): + """ + Backbone (bottom-up) for FBNet. + + Hierarchy: + trunk0: + xif0_0 + xif0_1 + ... + trunk1: + xif1_0 + xif1_1 + ... + ... + + Output features: + The outputs from each "stage", i.e. trunkX. + """ + + def __init__(self, cfg): + super(FBNetV2Backbone, self).__init__() + stages, shape_specs = build_fbnet( + cfg, name="trunk", in_channels=cfg.MODEL.FBNET_V2.STEM_IN_CHANNELS + ) + + self._trunk_stage_names = [] + self._trunk_stages = [] + + self._out_feature_channels = {} + self._out_feature_strides = {} + for i, (stage, shape_spec) in enumerate(zip(stages, shape_specs)): + name = "trunk{}".format(i) + self.add_module(name, stage) + self._trunk_stage_names.append(name) + self._trunk_stages.append(stage) + self._out_feature_channels[name] = shape_spec.channels + self._out_feature_strides[name] = shape_spec.stride + + # returned features are the final output of each stage + self._out_features = self._trunk_stage_names + self._trunk_stage_names = tuple(self._trunk_stage_names) + + def __prepare_scriptable__(self): + ret = copy.deepcopy(self) + ret._trunk_stages = nn.ModuleList(ret._trunk_stages) + for k in self._trunk_stage_names: + delattr(ret, k) + return ret + + @fbnet_identifier_checker + def _load_from_state_dict(self, *args, **kwargs): + return super()._load_from_state_dict(*args, **kwargs) + + # return features for each stage + def forward(self, x): + features = {} + for name, stage in zip(self._trunk_stage_names, self._trunk_stages): + x = stage(x) + features[name] = x + return features + + +class FBNetV2FPN(FPN): + """ + FPN module for FBNet. 
+ """ + + pass + + +def build_fbnet_backbone(cfg): + return FBNetV2Backbone(cfg) + + +@BACKBONE_REGISTRY.register() +class FBNetV2C4Backbone(Backbone): + def __init__(self, cfg, _): + super(FBNetV2C4Backbone, self).__init__() + self.body = build_fbnet_backbone(cfg) + self._out_features = self.body._out_features + self._out_feature_strides = self.body._out_feature_strides + self._out_feature_channels = self.body._out_feature_channels + + def forward(self, x): + return self.body(x) + + +@BACKBONE_REGISTRY.register() +def FBNetV2FpnBackbone(cfg, _): + backbone = FBNetV2FPN( + bottom_up=build_fbnet_backbone(cfg), + in_features=cfg.MODEL.FPN.IN_FEATURES, + out_channels=cfg.MODEL.FPN.OUT_CHANNELS, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelMaxPool(), + ) + + return backbone + + +@BACKBONE_REGISTRY.register() +def FBNetV2RetinaNetBackbone(cfg, _): + bottom_up = build_fbnet_backbone(cfg) + in_channels_p6p7 = bottom_up.output_shape( + )[cfg.MODEL.FPN.IN_FEATURES[-1]].channels + top_block = LastLevelP6P7(in_channels_p6p7, cfg.MODEL.FPN.OUT_CHANNELS) + top_block.in_feature = cfg.MODEL.FPN.IN_FEATURES[-1] + backbone = FBNetV2FPN( + bottom_up=bottom_up, + in_features=cfg.MODEL.FPN.IN_FEATURES, + out_channels=cfg.MODEL.FPN.OUT_CHANNELS, + norm=cfg.MODEL.FPN.NORM, + top_block=top_block, + ) + + return backbone + + +@RPN_HEAD_REGISTRY.register() +class FBNetV2RpnHead(nn.Module): + def __init__(self, cfg, input_shape: List[ShapeSpec]): + super(FBNetV2RpnHead, self).__init__() + + in_channels = [x.channels for x in input_shape] + assert len(set(in_channels)) == 1 + in_channels = in_channels[0] + anchor_generator = build_anchor_generator(cfg, input_shape) + num_cell_anchors = anchor_generator.num_cell_anchors + box_dim = anchor_generator.box_dim + assert len(set(num_cell_anchors)) == 1 + num_cell_anchors = num_cell_anchors[0] + + self.rpn_feature, shape_specs = build_fbnet( + cfg, name="rpn", in_channels=in_channels + ) + self.rpn_regressor = RPNHeadConvRegressor( + in_channels=shape_specs[-1].channels, + num_anchors=num_cell_anchors, + box_dim=box_dim, + ) + + def forward(self, x: List[torch.Tensor]): + x = [self.rpn_feature(y) for y in x] + return self.rpn_regressor(x) + + +@box_head.ROI_BOX_HEAD_REGISTRY.register() +class FBNetV2RoIBoxHead(nn.Module): + def __init__(self, cfg, input_shape: ShapeSpec): + super(FBNetV2RoIBoxHead, self).__init__() + + self.roi_box_conv, shape_specs = build_fbnet( + cfg, name="bbox", in_channels=input_shape.channels + ) + self._out_channels = shape_specs[-1].channels + + self.avgpool = nn.AdaptiveAvgPool2d(1) + + def forward(self, x): + x = self.roi_box_conv(x) + if len(x.shape) == 4 and (x.shape[2] > 1 or x.shape[3] > 1): + x = self.avgpool(x) + return x + + @property + @torch.jit.unused + def output_shape(self): + return ShapeSpec(channels=self._out_channels) + + +@keypoint_head.ROI_KEYPOINT_HEAD_REGISTRY.register() +class FBNetV2RoIKeypointHead(keypoint_head.BaseKeypointRCNNHead): + def __init__(self, cfg, input_shape: ShapeSpec): + super(FBNetV2RoIKeypointHead, self).__init__( + cfg=cfg, + input_shape=input_shape, + ) + + self.feature_extractor, shape_specs = build_fbnet( + cfg, name="kpts", in_channels=input_shape.channels + ) + + self.predictor = KeypointRCNNPredictor( + in_channels=shape_specs[-1].channels, + num_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS, + ) + + def layers(self, x): + x = self.feature_extractor(x) + x = self.predictor(x) + return x + + +@keypoint_head.ROI_KEYPOINT_HEAD_REGISTRY.register() +class 
FBNetV2RoIKeypointHeadKRCNNPredictorNoUpscale(keypoint_head.BaseKeypointRCNNHead): + def __init__(self, cfg, input_shape: ShapeSpec): + super(FBNetV2RoIKeypointHeadKRCNNPredictorNoUpscale, self).__init__( + cfg=cfg, + input_shape=input_shape, + ) + + self.feature_extractor, shape_specs = build_fbnet( + cfg, + name="kpts", + in_channels=input_shape.channels, + ) + + self.predictor = KeypointRCNNPredictorNoUpscale( + in_channels=shape_specs[-1].channels, + num_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS, + ) + + def layers(self, x): + x = self.feature_extractor(x) + x = self.predictor(x) + return x + + +@keypoint_head.ROI_KEYPOINT_HEAD_REGISTRY.register() +class FBNetV2RoIKeypointHeadKPRCNNIRFPredictorNoUpscale( + keypoint_head.BaseKeypointRCNNHead, +): + def __init__(self, cfg, input_shape: ShapeSpec): + super(FBNetV2RoIKeypointHeadKPRCNNIRFPredictorNoUpscale, self).__init__( + cfg=cfg, + input_shape=input_shape, + ) + + self.feature_extractor, shape_specs = build_fbnet( + cfg, + name="kpts", + in_channels=input_shape.channels, + ) + + self.predictor = KeypointRCNNIRFPredictorNoUpscale( + cfg, + in_channels=shape_specs[-1].channels, + num_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS, + ) + + def layers(self, x): + x = self.feature_extractor(x) + x = self.predictor(x) + return x + + +@keypoint_head.ROI_KEYPOINT_HEAD_REGISTRY.register() +class FBNetV2RoIKeypointHeadKPRCNNConvUpsamplePredictorNoUpscale( + keypoint_head.BaseKeypointRCNNHead, +): + def __init__(self, cfg, input_shape: ShapeSpec): + super( + FBNetV2RoIKeypointHeadKPRCNNConvUpsamplePredictorNoUpscale, self + ).__init__( + cfg=cfg, + input_shape=input_shape, + ) + + self.feature_extractor, shape_specs = build_fbnet( + cfg, + name="kpts", + in_channels=input_shape.channels, + ) + + self.predictor = KeypointRCNNConvUpsamplePredictorNoUpscale( + cfg, + in_channels=shape_specs[-1].channels, + num_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS, + ) + + def layers(self, x): + x = self.feature_extractor(x) + x = self.predictor(x) + return x + + +@mask_head.ROI_MASK_HEAD_REGISTRY.register() +class FBNetV2RoIMaskHead(mask_head.BaseMaskRCNNHead): + def __init__(self, cfg, input_shape: ShapeSpec): + super(FBNetV2RoIMaskHead, self).__init__( + cfg=cfg, + input_shape=input_shape, + ) + + self.feature_extractor, shape_specs = build_fbnet( + cfg, + name="mask", + in_channels=input_shape.channels, + ) + + num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES + self.predictor = MaskRCNNConv1x1Predictor( + shape_specs[-1].channels, num_classes) + + def layers(self, x): + x = self.feature_extractor(x) + x = self.predictor(x) + return x diff --git a/yolov7/modeling/backbone/fbnet_v3.py b/yolov7/modeling/backbone/fbnet_v3.py new file mode 100644 index 0000000..123932d --- /dev/null +++ b/yolov7/modeling/backbone/fbnet_v3.py @@ -0,0 +1,530 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + + +import copy + +from .fbnet_v2 import FBNetV2ModelArch +try: + from mobile_cv.arch.fbnet_v2.modeldef_utils import _ex, e1, e2, e1p, e3, e4, e6 +except Exception as e: + e1 = None + e3 = None + e4 = None + e6 = None + + +def _mutated_tuple(tp, pos, value): + tp_list = list(tp) + tp_list[pos] = value + return tuple(tp_list) + + +def _repeat_last(stage, n=None): + """ + Repeat the last "layer" of given stage, i.e. a (op_type, c, s, n_repeat, ...) + tuple, reset n_repeat if specified otherwise kept the original value. 
+ """ + assert isinstance(stage, list) + assert all(isinstance(x, tuple) for x in stage) + last_layer = copy.deepcopy(stage[-1]) + if n is not None: + last_layer = _mutated_tuple(last_layer, 3, n) + return last_layer + + +_BASIC_ARGS = { + # skil norm and activation for depthwise conv in IRF module, this make the + # model easier to quantize. + "dw_skip_bnrelu": True, + # uncomment below (always_pw and bias) to match model definition of the + # FBNetV1 builder. + # "always_pw": True, + # "bias": False, + # temporarily disable zero_last_bn_gamma + "zero_last_bn_gamma": False, +} + + +DEFAULT_STAGES = [ + # NOTE: each stage is a list of (op_type, out_channels, stride, n_repeat, ...) + # resolution stage 0, equivalent to 224->112 + [("conv_k3", 32, 2, 1), ("ir_k3", 16, 1, 1, e1)], + # resolution stage 1, equivalent to 112->56 + [("ir_k3", 24, 2, 2, e6)], + # resolution stage 2, equivalent to 56->28 + [("ir_k3", 32, 2, 3, e6)], + # resolution stage 3, equivalent to 28->14 + [("ir_k3", 64, 2, 4, e6), ("ir_k3", 96, 1, 3, e6)], + # resolution stage 4, equivalent to 14->7 + [("ir_k3", 160, 2, 3, e6), ("ir_k3", 320, 1, 1, e6)], + # final stage, equivalent to 7->1, ignored +] + +IRF_CFG = {"less_se_channels": False} + + +FBNetV3_A_dsmask = [ + [("conv_k3", 16, 2, 1), ("ir_k3", 16, 1, 1, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 32, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 32, 1, 1, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5", 40, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 40, 1, 3, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 72, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 72, 1, 3, {"expansion": 3}, IRF_CFG), + ("ir_k5", 112, 1, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 112, 1, 3, {"expansion": 4}, IRF_CFG), + ], + [ + ("ir_k5", 184, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 184, 1, 4, {"expansion": 4}, IRF_CFG), + ("ir_k5", 200, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_A_dsmask_tiny = [ + [("conv_k3", 8, 2, 1), ("ir_k3", 8, 1, 1, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 16, 2, 1, {"expansion": 3}, IRF_CFG), + ("ir_k5", 16, 1, 1, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5", 24, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 24, 1, 2, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 40, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 40, 1, 2, {"expansion": 3}, IRF_CFG), + ("ir_k5", 64, 1, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 64, 1, 2, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 92, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 92, 1, 2, {"expansion": 4}, IRF_CFG), + ("ir_k5", 92, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_A = [ + # FBNetV3 arch without hs + [("conv_k3", 16, 2, 1), ("ir_k3", 16, 1, 2, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 24, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 24, 1, 3, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5_se", 32, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3_se", 32, 1, 3, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 64, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 64, 1, 3, {"expansion": 3}, IRF_CFG), + ("ir_k5_se", 112, 1, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5_se", 112, 1, 5, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5_se", 184, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3_se", 184, 1, 4, {"expansion": 4}, IRF_CFG), + ("ir_k5_se", 200, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_B = [ + [("conv_k3", 16, 2, 1), ("ir_k3", 16, 1, 2, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 24, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 24, 1, 3, {"expansion": 2}, IRF_CFG), + ], + [ + 
("ir_k5_se", 40, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 40, 1, 4, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 72, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 72, 1, 4, {"expansion": 3}, IRF_CFG), + ("ir_k3_se", 120, 1, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 120, 1, 5, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3_se", 184, 2, 1, {"expansion": 6}, IRF_CFG), + ("ir_k5_se", 184, 1, 5, {"expansion": 4}, IRF_CFG), + ("ir_k5_se", 224, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_C = [ + [("conv_k3", 16, 2, 1), ("ir_k3", 16, 1, 2, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 24, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 24, 1, 4, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5_se", 48, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 48, 1, 4, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5", 88, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 88, 1, 4, {"expansion": 3}, IRF_CFG), + ("ir_k3_se", 120, 1, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5_se", 120, 1, 5, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5_se", 216, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 216, 1, 5, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 216, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_D = [ + [("conv_k3", 24, 2, 1), ("ir_k3", 16, 1, 2, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k3", 24, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 24, 1, 5, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5_se", 40, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3_se", 40, 1, 4, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3", 72, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 72, 1, 4, {"expansion": 3}, IRF_CFG), + ("ir_k3_se", 128, 1, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 128, 1, 6, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3_se", 208, 2, 1, {"expansion": 6}, IRF_CFG), + ("ir_k5_se", 208, 1, 5, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 240, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_E = [ + [("conv_k3", 24, 2, 1), ("ir_k3", 16, 1, 3, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 24, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 24, 1, 4, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5_se", 48, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5_se", 48, 1, 4, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 80, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 80, 1, 4, {"expansion": 3}, IRF_CFG), + ("ir_k3_se", 128, 1, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 128, 1, 7, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3_se", 216, 2, 1, {"expansion": 6}, IRF_CFG), + ("ir_k5_se", 216, 1, 5, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 240, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_F = [ + [("conv_k3", 24, 2, 1), ("ir_k3", 24, 1, 3, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 32, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 32, 1, 4, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5_se", 56, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5_se", 56, 1, 4, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 88, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 88, 1, 4, {"expansion": 3}, IRF_CFG), + ("ir_k3_se", 144, 1, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 144, 1, 8, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3_se", 248, 2, 1, {"expansion": 6}, IRF_CFG), + ("ir_k5_se", 248, 1, 6, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 272, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_G = [ + [("conv_k3", 32, 2, 1), ("ir_k3", 24, 1, 3, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 40, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 40, 1, 4, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5_se", 56, 2, 
1, {"expansion": 4}, IRF_CFG), + ("ir_k5_se", 56, 1, 4, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 104, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 104, 1, 4, {"expansion": 3}, IRF_CFG), + ("ir_k3_se", 160, 1, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 160, 1, 8, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3_se", 264, 2, 1, {"expansion": 6}, IRF_CFG), + ("ir_k5_se", 264, 1, 6, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 288, 1, 2, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_H = [ + [("conv_k3", 48, 2, 1), ("ir_k3", 32, 1, 4, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 64, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 64, 1, 6, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5_se", 80, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5_se", 80, 1, 6, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 160, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 160, 1, 6, {"expansion": 3}, IRF_CFG), + ("ir_k3_se", 240, 1, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 240, 1, 12, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3_se", 400, 2, 1, {"expansion": 6}, IRF_CFG), + ("ir_k5_se", 400, 1, 8, {"expansion": 5}, IRF_CFG), + ("ir_k5_se", 480, 1, 3, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_A_no_se = [ + # FBNetV3 without hs and SE (SE is not quantization friendly) + [("conv_k3", 16, 2, 1), ("ir_k3", 16, 1, 2, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 24, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 24, 1, 3, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 32, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 32, 1, 3, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 64, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 64, 1, 3, {"expansion": 3}, IRF_CFG), + ("ir_k5", 112, 1, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 112, 1, 5, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 184, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k3", 184, 1, 4, {"expansion": 4}, IRF_CFG), + ("ir_k5", 200, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + +FBNetV3_B_no_se = [ + [("conv_k3", 16, 2, 1), ("ir_k3", 16, 1, 2, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 24, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 24, 1, 3, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5", 40, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5", 40, 1, 4, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 72, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 72, 1, 4, {"expansion": 3}, IRF_CFG), + ("ir_k3", 120, 1, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5", 120, 1, 5, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3", 184, 2, 1, {"expansion": 6}, IRF_CFG), + ("ir_k5", 184, 1, 5, {"expansion": 4}, IRF_CFG), + ("ir_k5", 224, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + + +# FBNetV3_B model, a lighter version for real-time inference +FBNetV3_B_light_no_se = [ + [("conv_k3", 16, 2, 1), ("ir_k3", 16, 1, 2, {"expansion": 1}, IRF_CFG)], + [ + ("ir_k5", 24, 2, 1, {"expansion": 4}, IRF_CFG), + ("ir_k5", 24, 1, 2, {"expansion": 2}, IRF_CFG), + ], + [ + ("ir_k5", 40, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5", 40, 1, 3, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k5", 72, 2, 1, {"expansion": 5}, IRF_CFG), + ("ir_k3", 72, 1, 4, {"expansion": 3}, IRF_CFG), + ("ir_k3", 120, 1, 1, {"expansion": 5}, IRF_CFG), + ("ir_k5", 120, 1, 5, {"expansion": 3}, IRF_CFG), + ], + [ + ("ir_k3", 184, 2, 1, {"expansion": 6}, IRF_CFG), + ("ir_k5", 184, 1, 5, {"expansion": 4}, IRF_CFG), + ("ir_k5", 224, 1, 1, {"expansion": 6}, IRF_CFG), + ], +] + + +LARGE_BOX_HEAD_STAGES = [ + [("ir_k3", 160, 2, 1, e4), ("ir_k3", 160, 1, 2, e6), ("ir_k3", 240, 1, 1, e6)], +] + +SMALL_BOX_HEAD_STAGES = [ + 
[("ir_k3", 128, 2, 1, e4), ("ir_k3", 128, 1, 2, e6), ("ir_k3", 160, 1, 1, e6)], +] + +TINY_BOX_HEAD_STAGES = [ + [("ir_k3", 64, 2, 1, e4), ("ir_k3", 64, 1, 2, e4), ("ir_k3", 80, 1, 1, e4)], +] + +LARGE_UPSAMPLE_HEAD_STAGES = [ + [("ir_k3", 160, 1, 1, e4), ("ir_k3", 160, 1, 3, e6), ("ir_k3", 80, -2, 1, e3)], +] + +LARGE_UPSAMPLE_HEAD_D21_STAGES = [ + [("ir_k3", 192, 1, 1, e4), ("ir_k3", 192, 1, 5, e3), ("ir_k3", 96, -2, 1, e3)], +] + +SMALL_UPSAMPLE_HEAD_STAGES = [ + [("ir_k3", 128, 1, 1, e4), ("ir_k3", 128, 1, 3, e6), ("ir_k3", 64, -2, 1, e3)], +] + + +# NOTE: Compared with SMALL_UPSAMPLE_HEAD_STAGES, this does one more down-sample +# in the first "layer" and then up-sample twice +SMALL_DS_UPSAMPLE_HEAD_STAGES = [ + [ + ("ir_k3", 128, 2, 1, e4), + ("ir_k3", 128, 1, 2, e6), + ("ir_k3", 128, -2, 1, e6), + ("ir_k3", 64, -2, 1, e3), + ], # noqa +] + +TINY_DS_UPSAMPLE_HEAD_STAGES = [ + [ + ("ir_k3", 64, 2, 1, e4), + ("ir_k3", 64, 1, 2, e4), + ("ir_k3", 64, -2, 1, e4), + ("ir_k3", 40, -2, 1, e3), + ], # noqa +] + +FPN_UPSAMPLE_HEAD_STAGES = [ + [("ir_k3", 96, 1, 1, e6), ("ir_k3", 160, 1, 3, e6), ("ir_k3", 80, -2, 1, e3)], +] + +MODEL_ARCH_BUILTIN = { + "default": { + "trunk": DEFAULT_STAGES[0:4], + "rpn": [[_repeat_last(DEFAULT_STAGES[3])]], + "bbox": LARGE_BOX_HEAD_STAGES, + "mask": LARGE_UPSAMPLE_HEAD_STAGES, + "kpts": LARGE_UPSAMPLE_HEAD_STAGES, + "basic_args": _BASIC_ARGS, + }, + "default_dsmask": { + "trunk": DEFAULT_STAGES[0:4], + "rpn": [[_repeat_last(DEFAULT_STAGES[3])]], + "bbox": SMALL_BOX_HEAD_STAGES, + "mask": SMALL_DS_UPSAMPLE_HEAD_STAGES, + "kpts": SMALL_DS_UPSAMPLE_HEAD_STAGES, + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_A": { + "trunk": FBNetV3_A[0:4], + "rpn": [[_repeat_last(FBNetV3_A[3])]], + "bbox": [FBNetV3_A[4]], + "mask": SMALL_DS_UPSAMPLE_HEAD_STAGES, + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_B": { + "trunk": FBNetV3_B[0:4], + "rpn": [[_repeat_last(FBNetV3_B[3])]], + "bbox": [FBNetV3_B[4]], + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_C": { + "trunk": FBNetV3_C[0:4], + "rpn": [[_repeat_last(FBNetV3_C[3])]], + "bbox": [FBNetV3_C[4]], + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_D": { + "trunk": FBNetV3_D[0:4], + "rpn": [[_repeat_last(FBNetV3_D[3])]], + "bbox": [FBNetV3_D[4]], + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_E": { + "trunk": FBNetV3_E[0:4], + "rpn": [[_repeat_last(FBNetV3_E[3])]], + "bbox": [FBNetV3_E[4]], + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_F": { + "trunk": FBNetV3_F[0:4], + "rpn": [[_repeat_last(FBNetV3_F[3])]], + "bbox": [FBNetV3_F[4]], + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_G": { + "trunk": FBNetV3_G[0:4], + "rpn": [[_repeat_last(FBNetV3_G[3])]], + "bbox": [FBNetV3_G[4]], + "mask": LARGE_UPSAMPLE_HEAD_STAGES, + "kpts": LARGE_UPSAMPLE_HEAD_D21_STAGES, + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_H": { + "trunk": FBNetV3_H[0:4], + "rpn": [[_repeat_last(FBNetV3_H[3])]], + "bbox": [FBNetV3_H[4]], + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_A_dsmask_C5": { + "trunk": FBNetV3_A_dsmask, + "rpn": [[_repeat_last(FBNetV3_A_dsmask[3])]], + "bbox": SMALL_BOX_HEAD_STAGES, + "mask": SMALL_DS_UPSAMPLE_HEAD_STAGES, + "kpts": SMALL_DS_UPSAMPLE_HEAD_STAGES, + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_A_dsmask": { + "trunk": FBNetV3_A_dsmask[0:4], + "rpn": [[_repeat_last(FBNetV3_A_dsmask[3])]], + "bbox": SMALL_BOX_HEAD_STAGES, + "mask": SMALL_DS_UPSAMPLE_HEAD_STAGES, + "kpts": SMALL_DS_UPSAMPLE_HEAD_STAGES, + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_A_dsmask_tiny": { + "trunk": FBNetV3_A_dsmask_tiny[0:4], + "rpn": 
[[_repeat_last(FBNetV3_A_dsmask_tiny[3])]], + "bbox": TINY_BOX_HEAD_STAGES, + "mask": TINY_DS_UPSAMPLE_HEAD_STAGES, + "kpts": TINY_DS_UPSAMPLE_HEAD_STAGES, + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_B_light_large": { + "trunk": FBNetV3_B_light_no_se[0:4], + "rpn": [[_repeat_last(FBNetV3_B_light_no_se[3])]], + "bbox": SMALL_BOX_HEAD_STAGES, + "mask": SMALL_DS_UPSAMPLE_HEAD_STAGES, + "kpts": LARGE_UPSAMPLE_HEAD_D21_STAGES, + "basic_args": _BASIC_ARGS, + }, + "FBNetV3_G_fpn": { + "trunk": FBNetV3_G[0:5], # FPN uses all 5 stages + "rpn": [[_repeat_last(FBNetV3_G[3], n=1)]], + "bbox": [FBNetV3_G[4]], + "mask": FPN_UPSAMPLE_HEAD_STAGES, + "kpts": LARGE_UPSAMPLE_HEAD_D21_STAGES, + "basic_args": _BASIC_ARGS, + }, +} + +FBNetV2ModelArch.add_archs(MODEL_ARCH_BUILTIN) diff --git a/yolov7/modeling/backbone/layers/__init__.py b/yolov7/modeling/backbone/layers/__init__.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/layers/activations.py b/yolov7/modeling/backbone/layers/activations.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/layers/row_column_decoupled_attention.py b/yolov7/modeling/backbone/layers/row_column_decoupled_attention.py new file mode 100644 index 0000000..72d7534 --- /dev/null +++ b/yolov7/modeling/backbone/layers/row_column_decoupled_attention.py @@ -0,0 +1,433 @@ +# ------------------------------------------------------------------------ +# Copyright (c) 2021 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from nn.MultiheadAttention +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# ------------------------------------------------------------------------ +import warnings +import torch +from torch.nn.functional import linear,softmax,dropout,pad +from torch.nn import Linear +from torch.nn.init import xavier_uniform_ +from torch.nn.init import constant_ +from torch.nn.init import xavier_normal_ +from torch.nn.parameter import Parameter +from torch.nn.modules import Module +from torch.nn import functional as F + + +import torch + +from torch.nn import grad # noqa: F401 + +from torch._jit_internal import boolean_dispatch, List, Optional, _overload +from torch.overrides import has_torch_function, handle_torch_function + + +Tensor = torch.Tensor + + +def multi_head_rcda_forward(query_row, # type: Tensor + query_col, # type: Tensor + key_row, # type: Tensor + key_col, # type: Tensor + value, # type: Tensor + embed_dim_to_check, # type: int + num_heads, # type: int + in_proj_weight, # type: Tensor + in_proj_bias, # type: Tensor + bias_k_row, # type: Optional[Tensor] + bias_k_col, # type: Optional[Tensor] + bias_v, # type: Optional[Tensor] + add_zero_attn, # type: bool + dropout_p, # type: float + out_proj_weight, # type: Tensor + out_proj_bias, # type: Tensor + training=True, # type: bool + key_padding_mask=None, # type: Optional[Tensor] + need_weights=True, # type: bool + attn_mask=None, # type: Optional[Tensor] + use_separate_proj_weight=False, # type: bool + q_row_proj_weight=None, # type: Optional[Tensor] + q_col_proj_weight=None, # type: Optional[Tensor] + k_row_proj_weight=None, # type: Optional[Tensor] + k_col_proj_weight=None, # type: Optional[Tensor] + v_proj_weight=None, # type: Optional[Tensor] + static_k=None, # type: Optional[Tensor] + static_v=None # type: Optional[Tensor] + ): + # type: (...) 
-> Tuple[Tensor, Optional[Tensor]] + r""" + Args: + query_row, query_col, key_row, key_col, value: map a query and a set of key-value pairs to an output. + See "Anchor DETR: Query Design for Transformer-Based Detector" for more details. + embed_dim_to_check: total dimension of the model. + num_heads: parallel attention heads. + in_proj_weight, in_proj_bias: input projection weight and bias. + bias_k, bias_v: bias of the key and value sequences to be added at dim=0. + add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + dropout_p: probability of an element to be zeroed. + out_proj_weight, out_proj_bias: the output projection weight and bias. + training: apply dropout if is ``True``. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. This is an additive mask + (i.e. the values will be added to the attention layer). A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + use_separate_proj_weight: the function accept the proj. weights for query, key, + and value in different forms. If false, in_proj_weight will be used, which is + a combination of q_row_proj_weight, q_col_proj_weight, k_row_proj_weight, k_col_proj_weight, v_proj_weight. + q_row_proj_weight, q_col_proj_weight, k_row_proj_weight, k_col_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. + static_k, static_v: static key and value used for attention operators. + + + Shape: + Inputs: + - query_row: :math:`(N, L, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - query_col: :math:`(N, L, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key_row: :math:`(N, H, W, E)`, where W is the source sequence row length, N is the batch size, E is + the embedding dimension. + - key_col: :math:`(N, H, W, E)`, where H is the source sequence column length, N is the batch size, E is + the embedding dimension. + - value: :math:`(N, H, W, E)` where HW is the source sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(N, H, W)`, ByteTensor, where N is the batch size, HW is the source sequence length. + - attn_mask: Not Implemented + - static_k: Not Implemented + - static_v: Not Implemented + + Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: :math:`(N, L, HW)` where N is the batch size, + L is the target sequence length, HW is the source sequence length. 
+ """ + if not torch.jit.is_scripting(): + tens_ops = (query_row,query_col, key_row, key_col, value, in_proj_weight, in_proj_bias, bias_k_row,bias_k_col, bias_v, + out_proj_weight, out_proj_bias) + if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops): + return handle_torch_function( + multi_head_rcda_forward, tens_ops, query_row,query_col, key_row, key_col, value, + embed_dim_to_check, num_heads, in_proj_weight, in_proj_bias, + bias_k_row,bias_k_col, bias_v, add_zero_attn, dropout_p, out_proj_weight, + out_proj_bias, training=training, key_padding_mask=key_padding_mask, + need_weights=need_weights, attn_mask=attn_mask, + use_separate_proj_weight=use_separate_proj_weight, + q_row_proj_weight=q_row_proj_weight, q_col_proj_weight=q_col_proj_weight, + k_row_proj_weight=k_row_proj_weight, k_col_proj_weight=k_col_proj_weight, + v_proj_weight=v_proj_weight, static_k=static_k, static_v=static_v) + + + bsz, tgt_len, embed_dim = query_row.size() + src_len_row = key_row.size()[2] + src_len_col = key_col.size()[1] + + + assert embed_dim == embed_dim_to_check + # assert key.size() == value.size() + + head_dim = embed_dim // num_heads + assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + scaling = float(head_dim) ** -0.5 + + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q_row = linear(query_row, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 1 + _end = embed_dim * 2 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q_col = linear(query_col, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 2 + _end = embed_dim * 3 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k_row = linear(key_row, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 3 + _end = embed_dim * 4 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k_col = linear(key_col, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 4 + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + v = linear(value, _w, _b) + + q_row = q_row.transpose(0, 1) + q_col = q_col.transpose(0, 1) + k_row = k_row.mean(1).transpose(0, 1) + k_col = k_col.mean(2).transpose(0, 1) + + q_row = q_row * scaling + q_col = q_col * scaling + + + q_row = q_row.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + q_col = q_col.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + + if k_row is not None: + k_row = k_row.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + if k_col is not None: + k_col = k_col.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().permute(1,2,0,3).reshape(src_len_col,src_len_row, bsz*num_heads, head_dim).permute(2,0,1,3) + + + attn_output_weights_row = torch.bmm(q_row, k_row.transpose(1, 2)) + attn_output_weights_col = torch.bmm(q_col, k_col.transpose(1, 2)) + assert list(attn_output_weights_row.size()) == [bsz * num_heads, tgt_len, src_len_row] + assert list(attn_output_weights_col.size()) == 
[bsz * num_heads, tgt_len, src_len_col]
+
+
+    if key_padding_mask is not None:
+        mask_row=key_padding_mask[:,0,:].unsqueeze(1).unsqueeze(2)
+        mask_col=key_padding_mask[:,:,0].unsqueeze(1).unsqueeze(2)
+
+        attn_output_weights_row = attn_output_weights_row.view(bsz, num_heads, tgt_len, src_len_row)
+        attn_output_weights_col = attn_output_weights_col.view(bsz, num_heads, tgt_len, src_len_col)
+
+        attn_output_weights_row = attn_output_weights_row.masked_fill(mask_row,float('-inf'))
+        attn_output_weights_col = attn_output_weights_col.masked_fill(mask_col, float('-inf'))
+
+        attn_output_weights_row = attn_output_weights_row.view(bsz * num_heads, tgt_len, src_len_row)
+        attn_output_weights_col = attn_output_weights_col.view(bsz * num_heads, tgt_len, src_len_col)
+
+    attn_output_weights_col = softmax(attn_output_weights_col, dim=-1)
+    attn_output_weights_row = softmax(attn_output_weights_row, dim=-1)
+
+    attn_output_weights_col = dropout(attn_output_weights_col, p=dropout_p, training=training)
+    attn_output_weights_row = dropout(attn_output_weights_row, p=dropout_p, training=training)
+
+    efficient_compute=True
+    # This config will not affect the performance.
+    # It will compute the short edge first, which saves memory and runs slightly faster, but both paths give the same results.
+    # You can also set it "False" if your graph needs to be always the same.
+    if efficient_compute:
+        if src_len_col < src_len_row:
+            b_ein,q_ein,w_ein = attn_output_weights_row.shape
+            b_ein,h_ein,w_ein,c_ein = v.shape
+            attn_output_row = torch.matmul(attn_output_weights_row,v.permute(0,2,1,3).reshape(b_ein,w_ein,h_ein*c_ein)).reshape(b_ein,q_ein,h_ein,c_ein)
+            attn_output = torch.matmul(attn_output_weights_col[:,:,None,:],attn_output_row).squeeze(-2).permute(1,0,2).reshape(tgt_len,bsz,embed_dim)
+            ### the following code based on einsum gets the same results
+            # attn_output_row = torch.einsum("bqw,bhwc->bhqc",attn_output_weights_row,v)
+            # attn_output = torch.einsum("bqh,bhqc->qbc",attn_output_weights_col,attn_output_row).reshape(tgt_len,bsz,embed_dim)
+        else:
+            b_ein,q_ein,h_ein=attn_output_weights_col.shape
+            b_ein,h_ein,w_ein,c_ein = v.shape
+            attn_output_col = torch.matmul(attn_output_weights_col,v.reshape(b_ein,h_ein,w_ein*c_ein)).reshape(b_ein,q_ein,w_ein,c_ein)
+            attn_output = torch.matmul(attn_output_weights_row[:,:,None,:],attn_output_col).squeeze(-2).permute(1,0,2).reshape(tgt_len, bsz, embed_dim)
+            ### the following code based on einsum gets the same results
+            # attn_output_col = torch.einsum("bqh,bhwc->bqwc", attn_output_weights_col, v)
+            # attn_output = torch.einsum("bqw,bqwc->qbc", attn_output_weights_row, attn_output_col).reshape(tgt_len, bsz,embed_dim)
+    else:
+        b_ein, q_ein, h_ein = attn_output_weights_col.shape
+        b_ein, h_ein, w_ein, c_ein = v.shape
+        attn_output_col = torch.matmul(attn_output_weights_col, v.reshape(b_ein, h_ein, w_ein * c_ein)).reshape(b_ein, q_ein, w_ein, c_ein)
+        attn_output = torch.matmul(attn_output_weights_row[:, :, None, :], attn_output_col).squeeze(-2).permute(1, 0, 2).reshape(tgt_len, bsz, embed_dim)
+        ### the following code based on einsum gets the same results
+        # attn_output_col = torch.einsum("bqh,bhwc->bqwc", attn_output_weights_col, v)
+        # attn_output = torch.einsum("bqw,bqwc->qbc", attn_output_weights_row, attn_output_col).reshape(tgt_len, bsz,embed_dim)
+
+    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+
+    if need_weights:
+        return attn_output,torch.einsum("bqw,bqh->qbhw",attn_output_weights_row,attn_output_weights_col).reshape(tgt_len,bsz,num_heads,src_len_col,src_len_row).mean(2)
+    else:
+        return attn_output, None
+
+
+
+class MultiheadRCDA(Module):
+    r"""Allows the model to jointly attend to information
+    from different representation subspaces.
+    See reference:
+    Anchor DETR: Query Design for Transformer-Based Detector
+
+    .. math::
+        \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+        \text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
+
+    Args:
+        embed_dim: total dimension of the model.
+ num_heads: parallel attention heads. + dropout: a Dropout layer on attn_output_weights. Default: 0.0. + bias: add bias as module parameter. Default: True. + add_bias_kv: add bias to the key and value sequences at dim=0. + add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + kdim: total number of features in key. Default: None. + vdim: total number of features in key. Default: None. + + Note: if kdim and vdim are None, they will be set to embed_dim such that + query, key, and value have the same number of features. + + Examples:: + >>> multihead_attn = MultiheadRCDA(embed_dim, num_heads) + >>> attn_output, attn_output_weights = multihead_attn(query_row, query_col, key_row, key_col, value) + """ + __annotations__ = { + 'bias_k_row': torch._jit_internal.Optional[torch.Tensor], + 'bias_k_col': torch._jit_internal.Optional[torch.Tensor], + 'bias_v': torch._jit_internal.Optional[torch.Tensor], + } + __constants__ = ['q_row_proj_weight', 'q_col_proj_weight', 'k_row_proj_weight', 'k_col_proj_weight', 'v_proj_weight', 'in_proj_weight'] + + def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None, vdim=None): + super(MultiheadRCDA, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + if self._qkv_same_embed_dim is False: + self.q_row_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) + self.q_col_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim)) + self.k_row_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) + self.k_col_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim)) + self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim)) + self.register_parameter('in_proj_weight', None) + else: + self.in_proj_weight = Parameter(torch.empty(5 * embed_dim, embed_dim)) + self.register_parameter('q_row_proj_weight', None) + self.register_parameter('q_col_proj_weight', None) + self.register_parameter('k_row_proj_weight', None) + self.register_parameter('k_col_proj_weight', None) + self.register_parameter('v_proj_weight', None) + + if bias: + self.in_proj_bias = Parameter(torch.empty(5 * embed_dim)) + else: + self.register_parameter('in_proj_bias', None) + self.out_proj = Linear(embed_dim, embed_dim, bias=bias) + + if add_bias_kv: + self.bias_k_row = Parameter(torch.empty(1, 1, embed_dim)) + self.bias_k_col = Parameter(torch.empty(1, 1, embed_dim)) + self.bias_v = Parameter(torch.empty(1, 1, embed_dim)) + else: + self.bias_k_row = self.bias_k_col = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self._reset_parameters() + + def _reset_parameters(self): + if self._qkv_same_embed_dim: + xavier_uniform_(self.in_proj_weight) + else: + xavier_uniform_(self.q_row_proj_weight) + xavier_uniform_(self.q_col_proj_weight) + xavier_uniform_(self.k_row_proj_weight) + xavier_uniform_(self.k_col_proj_weight) + xavier_uniform_(self.v_proj_weight) + + if self.in_proj_bias is not None: + constant_(self.in_proj_bias, 0.) + constant_(self.out_proj.bias, 0.) 
+ if self.bias_k_row is not None: + xavier_normal_(self.bias_k_row) + if self.bias_k_col is not None: + xavier_normal_(self.bias_k_col) + if self.bias_v is not None: + xavier_normal_(self.bias_v) + + def __setstate__(self, state): + # Support loading old MultiheadAttention checkpoints generated by v1.1.0 + if '_qkv_same_embed_dim' not in state: + state['_qkv_same_embed_dim'] = True + + super(MultiheadRCDA, self).__setstate__(state) + + def forward(self, query_row, query_col, key_row, key_col, value, + key_padding_mask=None, need_weights=False, attn_mask=None): + # type: (Tensor, Tensor, Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]] + r""" + Args: + query_row, query_col, key_row, key_col, value: map a query and a set of key-value pairs to an output. + See "Anchor DETR: Query Design for Transformer-Based Detector" for more details. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. This is an additive mask + (i.e. the values will be added to the attention layer). A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + + Shape: + - Inputs: + - query_row: :math:`(N, L, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - query_col: :math:`(N, L, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key_row: :math:`(N, H, W, E)`, where W is the source sequence row length, N is the batch size, E is + the embedding dimension. + - key_col: :math:`(N, H, W, E)`, where H is the source sequence column length, N is the batch size, E is + the embedding dimension. + - value: :math:`(N, H, W, E)` where HW is the source sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(N, H, W)`, ByteTensor, where N is the batch size, HW is the source sequence length. + - attn_mask: Not Implemented + - static_k: Not Implemented + - static_v: Not Implemented + + - Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: :math:`(N, L, HW)` where N is the batch size, + L is the target sequence length, HW is the source sequence length. 
+ """ + if not self._qkv_same_embed_dim: + return multi_head_rcda_forward( + query_row,query_col, key_row, key_col, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k_row,self.bias_k_col, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask, use_separate_proj_weight=True, + q_row_proj_weight=self.q_row_proj_weight, q_col_proj_weight=self.q_col_proj_weight, + k_row_proj_weight=self.k_row_proj_weight, k_col_proj_weight=self.k_col_proj_weight, + v_proj_weight=self.v_proj_weight) + else: + return multi_head_rcda_forward( + query_row,query_col, key_row,key_col, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k_row,self.bias_k_col, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask) + diff --git a/yolov7/modeling/backbone/layers/smca_attention.py b/yolov7/modeling/backbone/layers/smca_attention.py new file mode 100644 index 0000000..754969d --- /dev/null +++ b/yolov7/modeling/backbone/layers/smca_attention.py @@ -0,0 +1,365 @@ +import torch +from torch.nn.functional import linear, pad +from torch import Tensor +from torch.nn import MultiheadAttention + +from typing import Optional, Tuple, List +import warnings + + +def multi_head_attention_forward( + query: Tensor, + key: Tensor, + value: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Tensor, + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Tensor, + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, + gaussian: Optional[Tensor] = None, +) -> Tuple[Tensor, Optional[Tensor]]: + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + See "Attention Is All You Need" for more details. + embed_dim_to_check: total dimension of the model. + num_heads: parallel attention heads. + in_proj_weight, in_proj_bias: input projection weight and bias. + bias_k, bias_v: bias of the key and value sequences to be added at dim=0. + add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + dropout_p: probability of an element to be zeroed. + out_proj_weight, out_proj_bias: the output projection weight and bias. + training: apply dropout if is ``True``. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + use_separate_proj_weight: the function accept the proj. weights for query, key, + and value in different forms. 
If false, in_proj_weight will be used, which is + a combination of q_proj_weight, k_proj_weight, v_proj_weight. + q_proj_weight, k_proj_weight, v_proj_weight: input projection weight and bias. + static_k, static_v: static key and value used for attention operators. + gaussian: the generated Gaussian-like weight map + Shape: + Inputs: + - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions + will be unchanged. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` + are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, + N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. + - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length, + N is the batch size, E is the embedding dimension. E/num_heads is the head dimension. + Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: :math:`(N, L, S)` where N is the batch size, + L is the target sequence length, S is the source sequence length. 
+ """ + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == embed_dim_to_check + # allow MHA to have different sizes for the feature dimension + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = embed_dim // num_heads + assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + scaling = float(head_dim) ** -0.5 + + if not use_separate_proj_weight: + if torch.equal(query, key) and torch.equal(key, value): + # self-attention + q, k, v = linear(query, in_proj_weight, + in_proj_bias).chunk(3, dim=-1) + + elif torch.equal(key, value): + # encoder-decoder attention + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = linear(query, _w, _b) + + if key is None: + assert value is None + k = None + v = None + else: + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + k, v = linear(key, _w, _b).chunk(2, dim=-1) + else: + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = linear(query, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = embed_dim * 2 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k = linear(key, _w, _b) + + # This is inline in_proj function with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 2 + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + v = linear(value, _w, _b) + else: + q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight) + len1, len2 = q_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == query.size(-1) + + k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight) + len1, len2 = k_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == key.size(-1) + + v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight) + len1, len2 = v_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == value.size(-1) + + if in_proj_bias is not None: + q = linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim]) + k = linear(key, k_proj_weight_non_opt, + in_proj_bias[embed_dim:(embed_dim * 2)]) + v = linear(value, v_proj_weight_non_opt, + in_proj_bias[(embed_dim * 2):]) + else: + q = linear(query, q_proj_weight_non_opt, in_proj_bias) + k = linear(key, k_proj_weight_non_opt, in_proj_bias) + v = linear(value, v_proj_weight_non_opt, in_proj_bias) + q = q * scaling + + if attn_mask is not None: + assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \ + attn_mask.dtype == torch.float16 or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool, \ + 'Only float, byte, and bool types are supported for attn_mask, not {}'.format( + attn_mask.dtype) + if attn_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. 
Use bool tensor instead.") + attn_mask = attn_mask.to(torch.bool) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError( + 'The size of the 2D attn_mask is not correct.') + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [bsz * num_heads, query.size(0), key.size(0)]: + raise RuntimeError( + 'The size of the 3D attn_mask is not correct.') + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format(attn_mask.dim())) + # attn_mask's dim is 3 now. + + # convert ByteTensor key_padding_mask to bool + if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: + warnings.warn( + "Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.") + key_padding_mask = key_padding_mask.to(torch.bool) + + if bias_k is not None and bias_v is not None: + if static_k is None and static_v is None: + k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + else: + assert static_k is None, "bias cannot be added to static key." + assert static_v is None, "bias cannot be added to static value." + else: + assert bias_k is None + assert bias_v is None + + q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + if k is not None: + k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + + if static_k is not None: + assert static_k.size(0) == bsz * num_heads + assert static_k.size(2) == head_dim + k = static_k + + if static_v is not None: + assert static_v.size(0) == bsz * num_heads + assert static_v.size(2) == head_dim + v = static_v + + src_len = k.size(1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if add_zero_attn: + src_len += 1 + k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size() + [2:], dtype=k.dtype, device=k.device)], dim=1) + v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size() + [2:], dtype=v.dtype, device=v.device)], dim=1) + if attn_mask is not None: + attn_mask = pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = pad(key_padding_mask, (0, 1)) + naive = True + if naive: + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list(attn_output_weights.size()) == [ + bsz * num_heads, tgt_len, src_len] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float('-inf')) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view( + bsz, num_heads, tgt_len, src_len) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), float('-inf'),) + attn_output_weights = attn_output_weights.view( + bsz * num_heads, tgt_len, src_len) + attn_output_weights = attn_output_weights + \ + gaussian[0].permute(2, 0, 1) + attn_output_weights = torch.nn.functional.softmax( + attn_output_weights, dim=-1) + attn_output_weights = torch.nn.functional.dropout(attn_output_weights, p=dropout_p, + training=training) + + attn_output = torch.bmm(attn_output_weights, v) + assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim] + + 
attn_output = attn_output.transpose( + 0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn_output = linear(attn_output, out_proj_weight, out_proj_bias) + + return attn_output, attn_output_weights.view(bsz, num_heads, tgt_len, src_len) + + +class GaussianMultiheadAttention(MultiheadAttention): + def __init__(self, embed_dim, num_heads, **kwargs): + super(GaussianMultiheadAttention, self).__init__( + embed_dim, num_heads, **kwargs) + self.gaussian = True + + def forward(self, query, key, value, key_padding_mask=None, + need_weights=False, attn_mask=None, gaussian=None): + # type: (Tensor, Tensor, Tensor, Optional[Tensor], bool, Optional[Tensor], Optional[Tensor]) -> Tuple[Tensor, Optional[Tensor]] + r""" + Args: + query, key, value: map a query and a set of key-value pairs to an output. + See "Attention Is All You Need" for more details. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. When given a binary mask and a value is True, + the corresponding value on the attention layer will be ignored. When given + a byte mask and a value is non-zero, the corresponding value on the attention + layer will be ignored + need_weights: output attn_output_weights. + attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + gaussian: 2D gaussian attention map that focus attention to certain object queries' initial estimations + with handcrafted query spatial priors. + + Shape: + - Inputs: + - query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is + the embedding dimension. + - key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is + the embedding dimension. + - key_padding_mask: :math:`(N, S)` where N is the batch size, S is the source sequence length. + If a ByteTensor is provided, the non-zero positions will be ignored while the position + with the zero positions will be unchanged. If a BoolTensor is provided, the positions with the + value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged. + - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length. + 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length, + S is the source sequence length. attn_mask ensure that position i is allowed to attend the unmasked + positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend + while the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` + is not allowed to attend while ``False`` values will be unchanged. If a FloatTensor + is provided, it will be added to the attention weight. + - gaussian: :math:`(L, S, nhead * batch_size)`, where nhead is the number of head in multi-head + attention module, L is the target sequence length, S is the source sequence length. + + - Outputs: + - attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, + E is the embedding dimension. + - attn_output_weights: :math:`(N, L, S)` where N is the batch size, + L is the target sequence length, S is the source sequence length. 
+ """ + if not self._qkv_same_embed_dim: + return multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask, use_separate_proj_weight=True, + q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight, + v_proj_weight=self.v_proj_weight, gaussian=gaussian) + else: + return multi_head_attention_forward( + query, key, value, self.embed_dim, self.num_heads, + self.in_proj_weight, self.in_proj_bias, + self.bias_k, self.bias_v, self.add_zero_attn, + self.dropout, self.out_proj.weight, self.out_proj.bias, + training=self.training, + key_padding_mask=key_padding_mask, need_weights=need_weights, + attn_mask=attn_mask, gaussian=gaussian) diff --git a/yolov7/modeling/backbone/layers/utils.py b/yolov7/modeling/backbone/layers/utils.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/layers/wrappers.py b/yolov7/modeling/backbone/layers/wrappers.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/mobilevit.py b/yolov7/modeling/backbone/mobilevit.py new file mode 100644 index 0000000..89e56b1 --- /dev/null +++ b/yolov7/modeling/backbone/mobilevit.py @@ -0,0 +1,538 @@ +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# + +from torch import nn +import argparse +from typing import Dict, Tuple, Optional + +from utils import logger + +from .config.mobilevit import get_configuration +from ...layers import ConvLayer, LinearLayer, GlobalPool, Dropout, SeparableConv +from ...modules import InvertedResidual, MobileViTBlock + + +from torch import nn, Tensor +from typing import Optional, Dict +import argparse + +from utils import logger + +from ... 
import parameter_list +from ...layers import norm_layers_tuple +from ...misc.profiler import module_profile +from ...misc.init_utils import initialize_weights + + +class BaseEncoder(nn.Module): + def __init__(self, *args, **kwargs): + super(BaseEncoder, self).__init__() + self.conv_1 = None + self.layer_1 = None + self.layer_2 = None + self.layer_3 = None + self.layer_4 = None + self.layer_5 = None + self.conv_1x1_exp = None + self.classifier = None + self.round_nearest = 8 + + self.model_conf_dict = dict() + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser): + return parser + + def check_model(self): + assert ( + self.model_conf_dict + ), "Model configuration dictionary should not be empty" + assert self.conv_1 is not None, "Please implement self.conv_1" + assert self.layer_1 is not None, "Please implement self.layer_1" + assert self.layer_2 is not None, "Please implement self.layer_2" + assert self.layer_3 is not None, "Please implement self.layer_3" + assert self.layer_4 is not None, "Please implement self.layer_4" + assert self.layer_5 is not None, "Please implement self.layer_5" + assert self.conv_1x1_exp is not None, "Please implement self.conv_1x1_exp" + assert self.classifier is not None, "Please implement self.classifier" + + def reset_parameters(self, opts): + initialize_weights(opts=opts, modules=self.modules()) + + def extract_end_points_all( + self, + x: Tensor, + use_l5: Optional[bool] = True, + use_l5_exp: Optional[bool] = False, + ) -> Dict: + out_dict = {} # Use dictionary over NamedTuple so that JIT is happy + x = self.conv_1(x) # 112 x112 + x = self.layer_1(x) # 112 x112 + out_dict["out_l1"] = x + + x = self.layer_2(x) # 56 x 56 + out_dict["out_l2"] = x + + x = self.layer_3(x) # 28 x 28 + out_dict["out_l3"] = x + + x = self.layer_4(x) # 14 x 14 + out_dict["out_l4"] = x + + if use_l5: + x = self.layer_5(x) # 7 x 7 + out_dict["out_l5"] = x + + if use_l5_exp: + x = self.conv_1x1_exp(x) + out_dict["out_l5_exp"] = x + return out_dict + + def extract_end_points_l4(self, x: Tensor) -> Dict: + return self.extract_end_points_all(x, use_l5=False) + + def extract_features(self, x: Tensor) -> Tensor: + x = self.conv_1(x) + x = self.layer_1(x) + x = self.layer_2(x) + x = self.layer_3(x) + + x = self.layer_4(x) + x = self.layer_5(x) + x = self.conv_1x1_exp(x) + return x + + def forward(self, x: Tensor) -> Tensor: + x = self.extract_features(x) + x = self.classifier(x) + return x + + def freeze_norm_layers(self): + for m in self.modules(): + if isinstance(m, norm_layers_tuple): + m.eval() + m.weight.requires_grad = False + m.bias.requires_grad = False + m.training = False + + def get_trainable_parameters( + self, weight_decay: float = 0.0, no_decay_bn_filter_bias: bool = False + ): + param_list = parameter_list( + named_parameters=self.named_parameters, + weight_decay=weight_decay, + no_decay_bn_filter_bias=no_decay_bn_filter_bias, + ) + return param_list, [1.0] * len(param_list) + + @staticmethod + def _profile_layers(layers, input, overall_params, overall_macs): + if not isinstance(layers, list): + layers = [layers] + + for layer in layers: + if layer is None: + continue + input, layer_param, layer_macs = module_profile(module=layer, x=input) + + overall_params += layer_param + overall_macs += layer_macs + + if isinstance(layer, nn.Sequential): + module_name = "\n+".join([l.__class__.__name__ for l in layer]) + else: + module_name = layer.__class__.__name__ + print( + "{:<15} \t {:<5}: {:>8.3f} M \t {:<5}: {:>8.3f} M".format( + module_name, + "Params", + 
round(layer_param / 1e6, 3), + "MACs", + round(layer_macs / 1e6, 3), + ) + ) + logger.singe_dash_line() + return input, overall_params, overall_macs + + def profile_model( + self, input: Tensor, is_classification: bool = True + ) -> (Tensor or Dict[Tensor], float, float): + # Note: Model profiling is for reference only and may contain errors. + # It relies heavily on the user to implement the underlying functions accurately. + overall_params, overall_macs = 0.0, 0.0 + + if is_classification: + logger.log("Model statistics for an input of size {}".format(input.size())) + logger.double_dash_line(dashes=65) + print("{:>35} Summary".format(self.__class__.__name__)) + logger.double_dash_line(dashes=65) + + out_dict = {} + input, overall_params, overall_macs = self._profile_layers( + [self.conv_1, self.layer_1], + input=input, + overall_params=overall_params, + overall_macs=overall_macs, + ) + out_dict["out_l1"] = input + + input, overall_params, overall_macs = self._profile_layers( + self.layer_2, + input=input, + overall_params=overall_params, + overall_macs=overall_macs, + ) + out_dict["out_l2"] = input + + input, overall_params, overall_macs = self._profile_layers( + self.layer_3, + input=input, + overall_params=overall_params, + overall_macs=overall_macs, + ) + out_dict["out_l3"] = input + + input, overall_params, overall_macs = self._profile_layers( + self.layer_4, + input=input, + overall_params=overall_params, + overall_macs=overall_macs, + ) + out_dict["out_l4"] = input + + input, overall_params, overall_macs = self._profile_layers( + self.layer_5, + input=input, + overall_params=overall_params, + overall_macs=overall_macs, + ) + out_dict["out_l5"] = input + + if self.conv_1x1_exp is not None: + input, overall_params, overall_macs = self._profile_layers( + self.conv_1x1_exp, + input=input, + overall_params=overall_params, + overall_macs=overall_macs, + ) + out_dict["out_l5_exp"] = input + + if is_classification: + classifier_params, classifier_macs = 0.0, 0.0 + if self.classifier is not None: + input, classifier_params, classifier_macs = module_profile( + module=self.classifier, x=input + ) + print( + "{:<15} \t {:<5}: {:>8.3f} M \t {:<5}: {:>8.3f} M".format( + "Classifier", + "Params", + round(classifier_params / 1e6, 3), + "MACs", + round(classifier_macs / 1e6, 3), + ) + ) + overall_params += classifier_params + overall_macs += classifier_macs + + logger.double_dash_line(dashes=65) + print( + "{:<20} = {:>8.3f} M".format("Overall parameters", overall_params / 1e6) + ) + # Counting Addition and Multiplication as 1 operation + print("{:<20} = {:>8.3f} M".format("Overall MACs", overall_macs / 1e6)) + overall_params_py = sum([p.numel() for p in self.parameters()]) + print( + "{:<20} = {:>8.3f} M".format( + "Overall parameters (sanity check)", overall_params_py / 1e6 + ) + ) + logger.double_dash_line(dashes=65) + + return out_dict, overall_params, overall_macs + + +class MobileViT(BaseEncoder): + """ + MobileViT: https://arxiv.org/abs/2110.02178?context=cs.LG + """ + + def __init__(self, opts, *args, **kwargs) -> None: + num_classes = getattr(opts, "model.classification.n_classes", 1000) + classifier_dropout = getattr( + opts, "model.classification.classifier_dropout", 0.2 + ) + + pool_type = getattr(opts, "model.layer.global_pool", "mean") + image_channels = 3 + out_channels = 16 + + mobilevit_config = get_configuration(opts=opts) + + # Segmentation architectures like Deeplab and PSPNet modifies the strides of the classification backbones + # We allow that using `output_stride` arguments 
+ output_stride = kwargs.get("output_stride", None) + dilate_l4 = dilate_l5 = False + if output_stride == 8: + dilate_l4 = True + dilate_l5 = True + elif output_stride == 16: + dilate_l5 = True + + super(MobileViT, self).__init__() + self.dilation = 1 + + # store model configuration in a dictionary + self.model_conf_dict = dict() + self.conv_1 = ConvLayer( + opts=opts, + in_channels=image_channels, + out_channels=out_channels, + kernel_size=3, + stride=2, + use_norm=True, + use_act=True, + ) + + self.model_conf_dict["conv1"] = {"in": image_channels, "out": out_channels} + + in_channels = out_channels + self.layer_1, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer1"] + ) + self.model_conf_dict["layer1"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + self.layer_2, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer2"] + ) + self.model_conf_dict["layer2"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + self.layer_3, out_channels = self._make_layer( + opts=opts, input_channel=in_channels, cfg=mobilevit_config["layer3"] + ) + self.model_conf_dict["layer3"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + self.layer_4, out_channels = self._make_layer( + opts=opts, + input_channel=in_channels, + cfg=mobilevit_config["layer4"], + dilate=dilate_l4, + ) + self.model_conf_dict["layer4"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + self.layer_5, out_channels = self._make_layer( + opts=opts, + input_channel=in_channels, + cfg=mobilevit_config["layer5"], + dilate=dilate_l5, + ) + self.model_conf_dict["layer5"] = {"in": in_channels, "out": out_channels} + + in_channels = out_channels + exp_channels = min(mobilevit_config["last_layer_exp_factor"] * in_channels, 960) + self.conv_1x1_exp = ConvLayer( + opts=opts, + in_channels=in_channels, + out_channels=exp_channels, + kernel_size=1, + stride=1, + use_act=True, + use_norm=True, + ) + + self.model_conf_dict["exp_before_cls"] = { + "in": in_channels, + "out": exp_channels, + } + + self.classifier = nn.Sequential() + self.classifier.add_module( + name="global_pool", module=GlobalPool(pool_type=pool_type, keep_dim=False) + ) + if 0.0 < classifier_dropout < 1.0: + self.classifier.add_module( + name="dropout", module=Dropout(p=classifier_dropout, inplace=True) + ) + self.classifier.add_module( + name="fc", + module=LinearLayer( + in_features=exp_channels, out_features=num_classes, bias=True + ), + ) + + # check model + self.check_model() + + # weight initialization + self.reset_parameters(opts=opts) + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser): + group = parser.add_argument_group( + title="".format(cls.__name__), description="".format(cls.__name__) + ) + group.add_argument( + "--model.classification.mit.mode", + type=str, + default=None, + choices=["xx_small", "x_small", "small"], + help="MIT mode", + ) + group.add_argument( + "--model.classification.mit.attn-dropout", + type=float, + default=0.1, + help="Dropout in attention layer", + ) + group.add_argument( + "--model.classification.mit.ffn-dropout", + type=float, + default=0.0, + help="Dropout between FFN layers", + ) + group.add_argument( + "--model.classification.mit.dropout", + type=float, + default=0.1, + help="Dropout in Transformer layer", + ) + group.add_argument( + "--model.classification.mit.transformer-norm-layer", + type=str, + default="layer_norm", + 
help="Normalization layer in transformer", + ) + group.add_argument( + "--model.classification.mit.no-fuse-local-global-features", + action="store_true", + help="Do not combine local and global features in MIT block", + ) + group.add_argument( + "--model.classification.mit.conv-kernel-size", + type=int, + default=3, + help="Kernel size of Conv layers in MIT block", + ) + + group.add_argument( + "--model.classification.mit.head-dim", + type=int, + default=None, + help="Head dimension in transformer", + ) + group.add_argument( + "--model.classification.mit.number-heads", + type=int, + default=None, + help="No. of heads in transformer", + ) + return parser + + def _make_layer( + self, opts, input_channel, cfg: Dict, dilate: Optional[bool] = False + ) -> Tuple[nn.Sequential, int]: + block_type = cfg.get("block_type", "mobilevit") + if block_type.lower() == "mobilevit": + return self._make_mit_layer( + opts=opts, input_channel=input_channel, cfg=cfg, dilate=dilate + ) + else: + return self._make_mobilenet_layer( + opts=opts, input_channel=input_channel, cfg=cfg + ) + + @staticmethod + def _make_mobilenet_layer( + opts, input_channel: int, cfg: Dict + ) -> Tuple[nn.Sequential, int]: + output_channels = cfg.get("out_channels") + num_blocks = cfg.get("num_blocks", 2) + expand_ratio = cfg.get("expand_ratio", 4) + block = [] + + for i in range(num_blocks): + stride = cfg.get("stride", 1) if i == 0 else 1 + + layer = InvertedResidual( + opts=opts, + in_channels=input_channel, + out_channels=output_channels, + stride=stride, + expand_ratio=expand_ratio, + ) + block.append(layer) + input_channel = output_channels + return nn.Sequential(*block), input_channel + + def _make_mit_layer( + self, opts, input_channel, cfg: Dict, dilate: Optional[bool] = False + ) -> Tuple[nn.Sequential, int]: + prev_dilation = self.dilation + block = [] + stride = cfg.get("stride", 1) + + if stride == 2: + if dilate: + self.dilation *= 2 + stride = 1 + + layer = InvertedResidual( + opts=opts, + in_channels=input_channel, + out_channels=cfg.get("out_channels"), + stride=stride, + expand_ratio=cfg.get("mv_expand_ratio", 4), + dilation=prev_dilation, + ) + + block.append(layer) + input_channel = cfg.get("out_channels") + + head_dim = cfg.get("head_dim", 32) + transformer_dim = cfg["transformer_channels"] + ffn_dim = cfg.get("ffn_dim") + if head_dim is None: + num_heads = cfg.get("num_heads", 4) + if num_heads is None: + num_heads = 4 + head_dim = transformer_dim // num_heads + + if transformer_dim % head_dim != 0: + logger.error( + "Transformer input dimension should be divisible by head dimension. 
" + "Got {} and {}.".format(transformer_dim, head_dim) + ) + + block.append( + MobileViTBlock( + opts=opts, + in_channels=input_channel, + transformer_dim=transformer_dim, + ffn_dim=ffn_dim, + n_transformer_blocks=cfg.get("transformer_blocks", 1), + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2), + dropout=getattr(opts, "model.classification.mit.dropout", 0.1), + ffn_dropout=getattr(opts, "model.classification.mit.ffn_dropout", 0.0), + attn_dropout=getattr( + opts, "model.classification.mit.attn_dropout", 0.1 + ), + head_dim=head_dim, + no_fusion=getattr( + opts, + "model.classification.mit.no_fuse_local_global_features", + False, + ), + conv_ksize=getattr( + opts, "model.classification.mit.conv_kernel_size", 3 + ), + ) + ) + + return nn.Sequential(*block), input_channel diff --git a/yolov7/modeling/backbone/pvt_v2.py b/yolov7/modeling/backbone/pvt_v2.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/regnet.py b/yolov7/modeling/backbone/regnet.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/res2nets/__init__.py b/yolov7/modeling/backbone/res2nets/__init__.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/res2nets/res2net.py b/yolov7/modeling/backbone/res2nets/res2net.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/res2nets/res2net_v1b.py b/yolov7/modeling/backbone/res2nets/res2net_v1b.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/res2nets/res2next.py b/yolov7/modeling/backbone/res2nets/res2next.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/res2nets/wrapper.py b/yolov7/modeling/backbone/res2nets/wrapper.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/resnetvd.py b/yolov7/modeling/backbone/resnetvd.py new file mode 100644 index 0000000..6764539 --- /dev/null +++ b/yolov7/modeling/backbone/resnetvd.py @@ -0,0 +1,550 @@ +import math +import torch.nn as nn +from timm.models.resnet import BasicBlock, Bottleneck +from timm.models.layers import DropBlock2d, DropPath, AvgPool2dSame + +from detectron2.layers import ShapeSpec, FrozenBatchNorm2d +from detectron2.modeling import Backbone, BACKBONE_REGISTRY +from detectron2.layers import NaiveSyncBatchNorm, DeformConv + + +def get_padding(kernel_size, stride, dilation=1): + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +""" +inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, + reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + attn_layer=None, aa_layer=None, drop_block=None, drop_path=None +""" + + +class DeformableBottleneck(nn.Module): + expansion = 4 + + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + cardinality=1, + base_width=64, + reduce_first=1, + dilation=1, + first_dilation=None, + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, + attn_layer=None, + aa_layer=None, + drop_block=None, + drop_path=None, + ): + super().__init__() + + width = int(math.floor(planes * (base_width / 64)) * cardinality) + first_planes = width // reduce_first + outplanes = planes * self.expansion + first_dilation = first_dilation or dilation + # use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) + + self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False) + self.bn1 = norm_layer(first_planes) + self.act1 = act_layer(inplace=True) + + self.conv2_offset = nn.Conv2d( + first_planes, + 18, + kernel_size=3, + 
stride=stride, + padding=first_dilation, + dilation=first_dilation, + ) + self.conv2 = DeformConv( + first_planes, + width, + kernel_size=3, + stride=stride, + padding=first_dilation, + bias=False, + dilation=first_dilation, + ) + + self.bn2 = norm_layer(width) + self.act2 = act_layer(inplace=True) + # self.aa = aa_layer(channels=width, stride=stride) if use_aa else None + + self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False) + self.bn3 = norm_layer(outplanes) + + # self.se = create_attn(attn_layer, outplanes) + + self.act3 = act_layer(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + # self.drop_block = drop_block + # self.drop_path = drop_path + + nn.init.constant_(self.conv2_offset.weight, 0) + nn.init.constant_(self.conv2_offset.bias, 0) + + def zero_init_last_bn(self): + nn.init.zeros_(self.bn3.weight) + + def forward(self, x): + shortcut = x + + x = self.conv1(x) + x = self.bn1(x) + + x = self.act1(x) + + offset = self.conv2_offset(x) + x = self.conv2(x, offset) + x = self.bn2(x) + x = self.act2(x) + + x = self.conv3(x) + x = self.bn3(x) + + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act3(x) + + return x + + +BLOCK_TYPE = { + "basic": BasicBlock, + "bottleneck": Bottleneck, + "deform_bottleneck": DeformableBottleneck, +} + + +def downsample_conv( + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + first_dilation=None, + norm_layer=None, +): + norm_layer = norm_layer or nn.BatchNorm2d + kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size + first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1 + p = get_padding(kernel_size, stride, first_dilation) + + return nn.Sequential( + *[ + nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=p, + dilation=first_dilation, + bias=False, + ), + norm_layer(out_channels), + ] + ) + + +def downsample_avg( + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + first_dilation=None, + norm_layer=None, +): + norm_layer = norm_layer or nn.BatchNorm2d + avg_stride = stride if dilation == 1 else 1 + if stride == 1 and dilation == 1: + pool = nn.Identity() + else: + avg_pool_fn = ( + AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d + ) + pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) + + return nn.Sequential( + *[ + pool, + nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False), + norm_layer(out_channels), + ] + ) + + +def drop_blocks(drop_block_rate=0.0): + return [ + None, + None, + DropBlock2d(drop_block_rate, 5, 0.25) if drop_block_rate else None, + DropBlock2d(drop_block_rate, 3, 1.00) if drop_block_rate else None, + ] + + +def make_blocks( + stage_block, + channels, + block_repeats, + inplanes, + reduce_first=1, + output_stride=32, + down_kernel_size=1, + avg_down=False, + drop_block_rate=0.0, + drop_path_rate=0.0, + **kwargs, +): + stages = [] + feature_info = [] + net_num_blocks = sum(block_repeats) + net_block_idx = 0 + net_stride = 4 + dilation = prev_dilation = 1 + for stage_idx, (planes, num_blocks, db) in enumerate( + zip(channels, block_repeats, drop_blocks(drop_block_rate)) + ): + # choose block_fn through the BLOCK_TYPE + block_fn = BLOCK_TYPE[stage_block[stage_idx]] + + stage_name = f"layer{stage_idx + 1}" # never liked this name, but weight compat requires it + stride = 1 if stage_idx == 0 else 2 + if net_stride >= output_stride: + dilation *= stride + stride 
= 1 + else: + net_stride *= stride + + downsample = None + if stride != 1 or inplanes != planes * block_fn.expansion: + down_kwargs = dict( + in_channels=inplanes, + out_channels=planes * block_fn.expansion, + kernel_size=down_kernel_size, + stride=stride, + dilation=dilation, + first_dilation=prev_dilation, + norm_layer=kwargs.get("norm_layer"), + ) + downsample = ( + downsample_avg(**down_kwargs) + if avg_down + else downsample_conv(**down_kwargs) + ) + + block_kwargs = dict( + reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs + ) + blocks = [] + for block_idx in range(num_blocks): + downsample = downsample if block_idx == 0 else None + stride = stride if block_idx == 0 else 1 + block_dpr = ( + drop_path_rate * net_block_idx / (net_num_blocks - 1) + ) # stochastic depth linear decay rule + blocks.append( + block_fn( + inplanes, + planes, + stride, + downsample, + first_dilation=prev_dilation, + drop_path=DropPath(block_dpr) if block_dpr > 0.0 else None, + **block_kwargs, + ) + ) + prev_dilation = dilation + inplanes = planes * block_fn.expansion + net_block_idx += 1 + + stages.append((stage_name, nn.Sequential(*blocks))) + feature_info.append( + dict(num_chs=inplanes, reduction=net_stride, module=stage_name) + ) + + return stages, feature_info + + +class ResNet(Backbone): + """ResNet / ResNeXt / SE-ResNeXt / SE-Net + + This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that + * have > 1 stride in the 3x3 conv layer of bottleneck + * have conv-bn-act ordering + + This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s + variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the + 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. + + ResNet variants (the same modifications can be used in SE/ResNeXt models as well): + * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b + * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64) + * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample + * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample + * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128) + * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample + * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample + + ResNeXt + * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths + * same c,d, e, s variants as ResNet can be enabled + + SE-ResNeXt + * normal - 7x7 stem, stem_width = 64 + * same c, d, e, s variants as ResNet can be enabled + + SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64, + reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block + + Parameters + ---------- + block : Block + Class for the residual block. Options are BasicBlockGl, BottleneckGl. + layers : list of int + Numbers of layers in each block + num_classes : int, default 1000 + Number of classification classes. + in_chans : int, default 3 + Number of input (color) channels. + cardinality : int, default 1 + Number of convolution groups for 3x3 conv in Bottleneck. + base_width : int, default 64 + Factor determining bottleneck channels. 
`planes * base_width / 64 * cardinality` + stem_width : int, default 64 + Number of channels in stem convolutions + stem_type : str, default '' + The type of stem: + * '', default - a single 7x7 conv with a width of stem_width + * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2 + * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2 + block_reduce_first: int, default 1 + Reduction factor for first convolution output width of residual blocks, + 1 for all archs except senets, where 2 + down_kernel_size: int, default 1 + Kernel size of residual block downsampling path, 1x1 for most archs, 3x3 for senets + avg_down : bool, default False + Whether to use average pooling for projection skip connection between stages/downsample. + output_stride : int, default 32 + Set the output stride of the network, 32, 16, or 8. Typically used in segmentation. + act_layer : nn.Module, activation layer + norm_layer : nn.Module, normalization layer + aa_layer : nn.Module, anti-aliasing layer + drop_rate : float, default 0. + Dropout probability before classifier, for training + global_pool : str, default 'avg' + Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax' + """ + + def __init__( + self, + block_types, + layers, + in_chans=3, + cardinality=1, + base_width=64, + stem_width=64, + stem_type="", + replace_stem_pool=False, + output_stride=32, + block_reduce_first=1, + down_kernel_size=1, + avg_down=False, + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, + aa_layer=None, + drop_rate=0.0, + drop_path_rate=0.0, + drop_block_rate=0.0, + global_pool="avg", + zero_init_last_bn=True, + block_args=None, + out_features=None, + ): + block_args = block_args or dict() + assert output_stride in (8, 16, 32) + # self.num_classes = num_classes + self.drop_rate = drop_rate + super(ResNet, self).__init__() + + # Stem + deep_stem = "deep" in stem_type + inplanes = stem_width * 2 if deep_stem else 64 + if deep_stem: + stem_chs = (stem_width, stem_width) + if "tiered" in stem_type: + stem_chs = (3 * (stem_width // 4), stem_width) + self.conv1 = nn.Sequential( + *[ + nn.Conv2d( + in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False + ), + norm_layer(stem_chs[0]), + act_layer(inplace=True), + nn.Conv2d( + stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False + ), + norm_layer(stem_chs[1]), + act_layer(inplace=True), + nn.Conv2d( + stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False + ), + ] + ) + else: + self.conv1 = nn.Conv2d( + in_chans, inplanes, kernel_size=7, stride=2, padding=3, bias=False + ) + self.bn1 = norm_layer(inplanes) + self.act1 = act_layer(inplace=True) + self.feature_info = [dict(num_chs=inplanes, reduction=2, module="act1")] + + # Stem Pooling + if replace_stem_pool: + self.maxpool = nn.Sequential( + *filter( + None, + [ + nn.Conv2d( + inplanes, + inplanes, + 3, + stride=1 if aa_layer else 2, + padding=1, + bias=False, + ), + aa_layer(channels=inplanes, stride=2) if aa_layer else None, + norm_layer(inplanes), + act_layer(inplace=True), + ], + ) + ) + else: + if aa_layer is not None: + self.maxpool = nn.Sequential( + *[ + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + aa_layer(channels=inplanes, stride=2), + ] + ) + else: + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # Feature Blocks + channels = [64, 128, 256, 512] + stage_modules, stage_feature_info = make_blocks( + block_types, + channels, + layers, + inplanes, + cardinality=cardinality, + base_width=base_width, + 
output_stride=output_stride, + reduce_first=block_reduce_first, + avg_down=avg_down, + down_kernel_size=down_kernel_size, + act_layer=act_layer, + norm_layer=norm_layer, + aa_layer=aa_layer, + drop_block_rate=drop_block_rate, + drop_path_rate=drop_path_rate, + **block_args, + ) + for stage in stage_modules: + self.add_module(*stage) # layer1, layer2, etc + self.feature_info.extend(stage_feature_info) + + for n, m in self.named_modules(): + if isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1.0) + nn.init.constant_(m.bias, 0.0) + if zero_init_last_bn: + for m in self.modules(): + if hasattr(m, "zero_init_last_bn"): + m.zero_init_last_bn() + + out_features_names = ["res2", "res3", "res4", "res5"] + self._out_feature_strides = dict(zip(out_features_names, [4, 8, 16, 32])) + self._out_feature_channels = dict( + zip( + out_features_names, + [x * BLOCK_TYPE[block_types[0]].expansion for x in [64, 128, 256, 512]], + ) + ) + if out_features is None: + self._out_features = out_features_names + else: + self._out_features = out_features + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], + stride=self._out_feature_strides[name], + ) + for name in self._out_features + } + + def size_divisibility(self): + return 32 + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + x = self.maxpool(x) + outputs = {} + x = self.layer1(x) + # outputs["res2"] = x + x = self.layer2(x) + outputs["res3"] = x + x = self.layer3(x) + outputs["res4"] = x + x = self.layer4(x) + outputs["res5"] = x + return outputs + + +@BACKBONE_REGISTRY.register() +def build_resnet_vd_backbone(cfg, input_shape): + + depth = cfg.MODEL.RESNETS.DEPTH + norm_name = cfg.MODEL.RESNETS.NORM + if norm_name == "FrozenBN": + norm = FrozenBatchNorm2d + elif norm_name == "SyncBN": + norm = NaiveSyncBatchNorm + else: + norm = nn.BatchNorm2d + if depth == 50: + layers = [3, 4, 6, 3] + elif depth == 101: + layers = [3, 4, 23, 3] + else: + raise NotImplementedError() + + stage_blocks = [] + use_deformable = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE + for idx in range(4): + if use_deformable[idx]: + stage_blocks.append("deform_bottleneck") + else: + stage_blocks.append("bottleneck") + + model = ResNet( + stage_blocks, + layers, + stem_type="deep", + stem_width=32, + avg_down=True, + norm_layer=norm, + ) + return model diff --git a/yolov7/modeling/backbone/smcadetr_backbone.py b/yolov7/modeling/backbone/smcadetr_backbone.py new file mode 100644 index 0000000..f82833f --- /dev/null +++ b/yolov7/modeling/backbone/smcadetr_backbone.py @@ -0,0 +1,590 @@ +""" +DETR Transformer class. 
+ +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +import copy +from typing import Optional, List +import math + +import torch +import torch.nn.functional as F +from torch import nn, Tensor +from collections import OrderedDict +from typing import Dict, List +import torchvision + +from .layers.smca_attention import GaussianMultiheadAttention +from yolov7.utils.misc import NestedTensor +from yolov7.utils.misc import is_main_process +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter + + +class Transformer(nn.Module): + + def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, + num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False, + return_intermediate_dec=False, smooth=8, dynamic_scale=True): + super().__init__() + + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder( + encoder_layer, num_encoder_layers, encoder_norm) + + decoder_layers = [] + for layer_index in range(num_decoder_layers): + decoder_layer = TransformerDecoderLayer(dynamic_scale, smooth, layer_index, + d_model, nhead, dim_feedforward, dropout, + activation, normalize_before) + decoder_layers.append(decoder_layer) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layers, num_decoder_layers, decoder_norm, + return_intermediate=return_intermediate_dec) + + self._reset_parameters() + if dynamic_scale in ["type2", "type3", "type4"]: + for layer_index in range(num_decoder_layers): + nn.init.zeros_(self.decoder.layers[layer_index].point3.weight) + with torch.no_grad(): + nn.init.ones_(self.decoder.layers[layer_index].point3.bias) + + self.d_model = d_model + self.nhead = nhead + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, query_embed, pos_embed, h_w): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + + grid_y, grid_x = torch.meshgrid(torch.arange(0, h), torch.arange(0, w)) + grid = torch.stack((grid_x, grid_y), 2).float().to(src.device) + grid = grid.reshape(-1, 2).unsqueeze(1).repeat(1, bs * 8, 1) + + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + mask = mask.flatten(1) + + tgt = torch.zeros_like(query_embed) + memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) + hs, points = self.decoder(grid, h_w, tgt, memory, memory_key_padding_mask=mask, + pos=pos_embed, query_pos=query_embed) + return hs.transpose(1, 2), points.transpose(0, 1) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer(output, src_mask=mask, + src_key_padding_mask=src_key_padding_mask, pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): 
+ + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = nn.ModuleList(decoder_layer) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, grid, h_w, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + output = tgt + + intermediate = [] + + points = [] + point_sigmoid_ref = None + for layer in self.layers: + output, point, point_sigmoid_ref = layer( + grid, h_w, output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, query_pos=query_pos, point_ref_previous=point_sigmoid_ref + ) + points.append(point) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate), points[0] + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, dynamic_scale, smooth, layer_index, + d_model, nhead, dim_feedforward=2048, 
dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = GaussianMultiheadAttention( + d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.smooth = smooth + self.dynamic_scale = dynamic_scale + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.norm4 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + if layer_index == 0: + self.point1 = MLP(256, 256, 2, 3) + self.point2 = nn.Linear(d_model, 2 * 8) + else: + self.point2 = nn.Linear(d_model, 2 * 8) + self.layer_index = layer_index + if self.dynamic_scale == "type2": + self.point3 = nn.Linear(d_model, 8) + elif self.dynamic_scale == "type3": + self.point3 = nn.Linear(d_model, 16) + elif self.dynamic_scale == "type4": + self.point3 = nn.Linear(d_model, 24) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, grid, h_w, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + point_ref_previous: Optional[Tensor] = None): + tgt_len = tgt.shape[0] + + out = self.norm4(tgt + query_pos) + point_sigmoid_offset = self.point2(out) + + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + if self.layer_index == 0: + point_sigmoid_ref_inter = self.point1(out) + point_sigmoid_ref = point_sigmoid_ref_inter.sigmoid() + point_sigmoid_ref = (h_w - 0) * point_sigmoid_ref / 32 + point_sigmoid_ref = point_sigmoid_ref.repeat(1, 1, 8) + else: + point_sigmoid_ref = point_ref_previous + point = point_sigmoid_ref + point_sigmoid_offset + point = point.view(tgt_len, -1, 2) + distance = (point.unsqueeze(1) - grid.unsqueeze(0)).pow(2) + + if self.dynamic_scale == "type1": + scale = 1 + distance = distance.sum(-1) * scale + elif self.dynamic_scale == "type2": + scale = self.point3(out) + scale = scale * scale + scale = scale.reshape(tgt_len, -1).unsqueeze(1) + distance = distance.sum(-1) * scale + elif self.dynamic_scale == "type3": + scale = self.point3(out) + scale = scale * scale + scale = scale.reshape(tgt_len, -1, 2).unsqueeze(1) + distance = (distance * scale).sum(-1) + elif self.dynamic_scale == "type4": + scale = self.point3(out) + scale = scale * scale + scale = scale.reshape(tgt_len, -1, 3).unsqueeze(1) + distance = torch.cat([distance, torch.prod( + distance, dim=-1, keepdim=True)], dim=-1) + distance = (distance * scale).sum(-1) + + gaussian = -(distance - 0).abs() / self.smooth + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + gaussian=[gaussian])[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = 
self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + if self.layer_index == 0: + return tgt, point_sigmoid_ref_inter, point_sigmoid_ref + else: + return tgt, None, point_sigmoid_ref + + def forward_pre(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, grid, h_w, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + point_ref_previous: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + return self.forward_post(grid, h_w, tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos, + point_ref_previous) + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) + for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, + dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class BackboneBase(nn.Module): + def __init__( + self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool + ): + super().__init__() + for name, parameter in backbone.named_parameters(): + if ( + not train_backbone + or "layer2" not in name + and "layer3" not in name + and "layer4" not in name + ): + parameter.requires_grad_(False) + if return_interm_layers: + return_layers = {"layer1": "0", "layer2": "1", + "layer3": "2", "layer4": "3"} + # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} + self.strides = [8, 16, 32] + self.num_channels = [512, 1024, 2048] + else: + return_layers = {"layer4": "0"} + self.strides = [32] + self.num_channels = [2048] + self.body = IntermediateLayerGetter( + backbone, return_layers=return_layers) + + def forward(self, tensor_list: NestedTensor): + xs = self.body(tensor_list.tensors) + out: Dict[str, NestedTensor] = {} + for name, x in xs.items(): + m = tensor_list.mask + assert m is not None + mask = F.interpolate( + m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] + out[name] = NestedTensor(x, mask) + return out + + +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. 
+ """ + + def __init__(self, n, eps=1e-5): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + self.eps = eps + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = self.eps + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return x * scale + bias + + +class Backbone(BackboneBase): + """ResNet backbone with frozen BatchNorm.""" + + def __init__( + self, + name: str, + train_backbone: bool, + return_interm_layers: bool, + dilation: bool, + ): + norm_layer = FrozenBatchNorm2d + backbone = getattr(torchvision.models, name)( + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), + norm_layer=norm_layer, + ) + assert name not in ( + "resnet18", "resnet34"), "number of channels are hard coded" + super().__init__(backbone, train_backbone, return_interm_layers) + if dilation: + self.strides[-1] = self.strides[-1] // 2 + + +class Joiner(nn.Sequential): + def __init__(self, backbone, position_embedding): + super().__init__(backbone, position_embedding) + self.strides = backbone.strides + self.num_channels = backbone.num_channels + + def forward(self, tensor_list: NestedTensor): + xs = self[0](tensor_list) + out: List[NestedTensor] = [] + pos = [] + for name, x in xs.items(): + out.append(x) + + # position encoding + for x in out: + pos.append(self[1](x).to(x.tensors.dtype)) + + # out: a list of NestedTensor + # each tensor has shape (B, C, H, W) + # each mask has shape (B, H, W) + # pos: a list of tensors, each has shape (B, C, H, W) + return out, pos + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def build_transformer(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + smooth=args.smooth, + dynamic_scale=args.dynamic_scale, + ) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") diff --git a/yolov7/modeling/backbone/swin_transformer.py b/yolov7/modeling/backbone/swin_transformer.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/volo.py b/yolov7/modeling/backbone/volo.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/backbone/yolov5_backbone.py b/yolov7/modeling/backbone/yolov5_backbone.py old mode 100755 new mode 100644 diff --git 
a/yolov7/modeling/head/__init__.py b/yolov7/modeling/head/__init__.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/head/box_regression.py b/yolov7/modeling/head/box_regression.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/head/decoder.py b/yolov7/modeling/head/decoder.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/head/effidehead.py b/yolov7/modeling/head/effidehead.py new file mode 100644 index 0000000..720375f --- /dev/null +++ b/yolov7/modeling/head/effidehead.py @@ -0,0 +1,208 @@ +import torch +import torch.nn as nn +import math +from ..backbone.efficientrep import * + + +class EffiDeHead(nn.Module): + '''Efficient Decoupled Head''' + def __init__(self, num_classes=80, anchors=1, num_layers=3, inplace=True, head_layers=None): # detection layer + super().__init__() + assert head_layers is not None + self.num_classes = num_classes # number of classes + self.num_outputs = num_classes + 5 # number of outputs per anchor + self.num_layers = num_layers # number of detection layers + if isinstance(anchors, (list, tuple)): + self.num_anchors = len(anchors[0]) // 2 + else: + self.num_anchors = anchors + self.anchors = anchors + self.grid = [torch.zeros(1)] * self.num_layers + self.prior_prob = 1e-2 + self.inplace = inplace + stride = [8, 16, 32] # strides computed during build + self.stride = torch.tensor(stride) + + # Init decouple head + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + + # Efficient decoupled head layers + for i in range(num_layers): + idx = i*6 + self.stems.append(head_layers[idx]) + self.cls_convs.append(head_layers[idx+1]) + self.reg_convs.append(head_layers[idx+2]) + self.cls_preds.append(head_layers[idx+3]) + self.reg_preds.append(head_layers[idx+4]) + self.obj_preds.append(head_layers[idx+5]) + + def initialize_biases(self): + for conv in self.cls_preds: + b = conv.bias.view(self.num_anchors, -1) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + for conv in self.obj_preds: + b = conv.bias.view(self.num_anchors, -1) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + def forward(self, x): + z = [] + for i in range(self.num_layers): + x[i] = self.stems[i](x[i]) + cls_x = x[i] + reg_x = x[i] + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + obj_output = self.obj_preds[i](reg_feat) + if self.training: + x[i] = torch.cat([reg_output, obj_output, cls_output], 1) + bs, _, ny, nx = x[i].shape + x[i] = x[i].view(bs, self.num_anchors, self.num_outputs, ny, nx).permute(0, 1, 3, 4, 2).contiguous() + else: + y = torch.cat([reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1) + bs, _, ny, nx = y.shape + y = y.view(bs, self.num_anchors, self.num_outputs, ny, nx).permute(0, 1, 3, 4, 2).contiguous() + if self.grid[i].shape[2:4] != y.shape[2:4]: + d = self.stride.device + yv, xv = torch.meshgrid([torch.arange(ny).to(d), torch.arange(nx).to(d)]) + self.grid[i] = torch.stack((xv, yv), 2).view(1, self.num_anchors, ny, nx, 2).float() + if self.inplace: + y[..., 0:2] = (y[..., 0:2] + self.grid[i]) * self.stride[i] # xy + y[..., 2:4] = torch.exp(y[..., 2:4]) * self.stride[i] # wh + else: + xy = (y[..., 0:2] 
+ self.grid[i]) * self.stride[i] # xy + wh = torch.exp(y[..., 2:4]) * self.stride[i] # wh + y = torch.cat((xy, wh, y[..., 4:]), -1) + z.append(y.view(bs, -1, self.num_outputs)) + return x if self.training else torch.cat(z, 1) + + +def build_effidehead_layer(channels_list, num_anchors, num_classes): + head_layers = nn.Sequential( + # stem0 + Conv( + in_channels=channels_list[6], + out_channels=channels_list[6], + kernel_size=1, + stride=1 + ), + # cls_conv0 + Conv( + in_channels=channels_list[6], + out_channels=channels_list[6], + kernel_size=3, + stride=1 + ), + # reg_conv0 + Conv( + in_channels=channels_list[6], + out_channels=channels_list[6], + kernel_size=3, + stride=1 + ), + # cls_pred0 + nn.Conv2d( + in_channels=channels_list[6], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred0 + nn.Conv2d( + in_channels=channels_list[6], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # obj_pred0 + nn.Conv2d( + in_channels=channels_list[6], + out_channels=1 * num_anchors, + kernel_size=1 + ), + # stem1 + Conv( + in_channels=channels_list[8], + out_channels=channels_list[8], + kernel_size=1, + stride=1 + ), + # cls_conv1 + Conv( + in_channels=channels_list[8], + out_channels=channels_list[8], + kernel_size=3, + stride=1 + ), + # reg_conv1 + Conv( + in_channels=channels_list[8], + out_channels=channels_list[8], + kernel_size=3, + stride=1 + ), + # cls_pred1 + nn.Conv2d( + in_channels=channels_list[8], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred1 + nn.Conv2d( + in_channels=channels_list[8], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # obj_pred1 + nn.Conv2d( + in_channels=channels_list[8], + out_channels=1 * num_anchors, + kernel_size=1 + ), + # stem2 + Conv( + in_channels=channels_list[10], + out_channels=channels_list[10], + kernel_size=1, + stride=1 + ), + # cls_conv2 + Conv( + in_channels=channels_list[10], + out_channels=channels_list[10], + kernel_size=3, + stride=1 + ), + # reg_conv2 + Conv( + in_channels=channels_list[10], + out_channels=channels_list[10], + kernel_size=3, + stride=1 + ), + # cls_pred2 + nn.Conv2d( + in_channels=channels_list[10], + out_channels=num_classes * num_anchors, + kernel_size=1 + ), + # reg_pred2 + nn.Conv2d( + in_channels=channels_list[10], + out_channels=4 * num_anchors, + kernel_size=1 + ), + # obj_pred2 + nn.Conv2d( + in_channels=channels_list[10], + out_channels=1 * num_anchors, + kernel_size=1 + ) + ) + return head_layers diff --git a/yolov7/modeling/head/encoder.py b/yolov7/modeling/head/encoder.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/head/sem_seg_head.py b/yolov7/modeling/head/sem_seg_head.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/head/solov2_head.py b/yolov7/modeling/head/solov2_head.py new file mode 100644 index 0000000..50b54f1 --- /dev/null +++ b/yolov7/modeling/head/solov2_head.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- +import logging +import math +from typing import List +from alfred.dl.torch.common import print_tensor +import numpy as np + +import torch +import torch.nn.functional as F +from torch import nn + +from detectron2.layers import ShapeSpec, batched_nms, cat, paste_masks_in_image +from detectron2.modeling.anchor_generator import DefaultAnchorGenerator +from detectron2.modeling.backbone import build_backbone +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY +from detectron2.structures import Boxes, ImageList, Instances +from 
detectron2.utils.logger import log_first_n + + +class SOLOv2InsHead(nn.Module): + def __init__(self, cfg, input_shape: List[ShapeSpec]): + """ + SOLOv2 Instance Head. + """ + super().__init__() + # fmt: off + self.num_classes = cfg.MODEL.SOLOV2.NUM_CLASSES + self.num_kernels = cfg.MODEL.SOLOV2.NUM_KERNELS + self.num_grids = cfg.MODEL.SOLOV2.NUM_GRIDS + self.instance_in_features = cfg.MODEL.SOLOV2.INSTANCE_IN_FEATURES + self.instance_strides = cfg.MODEL.SOLOV2.FPN_INSTANCE_STRIDES + # = fpn. + self.instance_in_channels = cfg.MODEL.SOLOV2.INSTANCE_IN_CHANNELS + self.instance_channels = cfg.MODEL.SOLOV2.INSTANCE_CHANNELS + # Convolutions to use in the towers + self.type_dcn = cfg.MODEL.SOLOV2.TYPE_DCN + self.num_levels = len(self.instance_in_features) + assert self.num_levels == len(self.instance_strides), \ + print("Strides should match the features.") + # fmt: on + + head_configs = { + "cate": ( + cfg.MODEL.SOLOV2.NUM_INSTANCE_CONVS, + cfg.MODEL.SOLOV2.USE_DCN_IN_INSTANCE, + False, + ), + "kernel": ( + cfg.MODEL.SOLOV2.NUM_INSTANCE_CONVS, + cfg.MODEL.SOLOV2.USE_DCN_IN_INSTANCE, + cfg.MODEL.SOLOV2.USE_COORD_CONV, + ), + } + + norm = None if cfg.MODEL.SOLOV2.NORM == "none" else cfg.MODEL.SOLOV2.NORM + in_channels = [s.channels for s in input_shape] + assert len(set(in_channels)) == 1, print( + "Each level must have the same channel!" + ) + in_channels = in_channels[0] + assert in_channels == cfg.MODEL.SOLOV2.INSTANCE_IN_CHANNELS, print( + "In channels should equal to tower in channels!" + ) + + for head in head_configs: + tower = [] + num_convs, use_deformable, use_coord = head_configs[head] + for i in range(num_convs): + conv_func = nn.Conv2d + if i == 0: + if use_coord: + chn = self.instance_in_channels + 2 + else: + chn = self.instance_in_channels + else: + chn = self.instance_channels + + tower.append( + conv_func( + chn, + self.instance_channels, + kernel_size=3, + stride=1, + padding=1, + bias=norm is None, + ) + ) + if norm == "GN": + tower.append(nn.GroupNorm(32, self.instance_channels)) + tower.append(nn.ReLU(inplace=True)) + self.add_module("{}_tower".format(head), nn.Sequential(*tower)) + + self.cate_pred = nn.Conv2d( + self.instance_channels, self.num_classes, kernel_size=3, stride=1, padding=1 + ) + self.kernel_pred = nn.Conv2d( + self.instance_channels, self.num_kernels, kernel_size=3, stride=1, padding=1 + ) + + for modules in [ + self.cate_tower, + self.kernel_tower, + self.cate_pred, + self.kernel_pred, + ]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + if l.bias is not None: + nn.init.constant_(l.bias, 0) + + # initialize the bias for focal loss + prior_prob = cfg.MODEL.SOLOV2.PRIOR_PROB + bias_value = -math.log((1 - prior_prob) / prior_prob) + torch.nn.init.constant_(self.cate_pred.bias, bias_value) + + def forward(self, features): + """ + Arguments: + features (list[Tensor]): FPN feature map tensors in high to low resolution. + Each tensor in the list correspond to different feature levels. 
+ + Returns: + pass + """ + cate_pred = [] + kernel_pred = [] + + for idx, feature in enumerate(features): + ins_kernel_feat = feature + # concat coord + x_range = torch.linspace( + -1, 1, ins_kernel_feat.shape[-1], device=ins_kernel_feat.device + ) + y_range = torch.linspace( + -1, 1, ins_kernel_feat.shape[-2], device=ins_kernel_feat.device + ) + y, x = torch.meshgrid(y_range, x_range) + y = y.expand([ins_kernel_feat.shape[0], 1, -1, -1]) + x = x.expand([ins_kernel_feat.shape[0], 1, -1, -1]) + coord_feat = torch.cat([x, y], 1) + ins_kernel_feat = torch.cat([ins_kernel_feat, coord_feat], 1) + + # individual feature. + kernel_feat = ins_kernel_feat + seg_num_grid = self.num_grids[idx] + kernel_feat = F.interpolate(kernel_feat, size=seg_num_grid, mode="bilinear") + cate_feat = kernel_feat[:, :-2, :, :] + + # kernel + kernel_feat = self.kernel_tower(kernel_feat) + kernel_pred.append(self.kernel_pred(kernel_feat)) + + # cate + cate_feat = self.cate_tower(cate_feat) + cate_pred.append(self.cate_pred(cate_feat)) + return cate_pred, kernel_pred + + +class SOLOv2MaskHead(nn.Module): + def __init__(self, cfg, input_shape: List[ShapeSpec]): + """ + SOLOv2 Mask Head. + """ + super().__init__() + # fmt: off + self.mask_on = cfg.MODEL.MASK_ON + self.num_masks = cfg.MODEL.SOLOV2.NUM_MASKS + self.mask_in_features = cfg.MODEL.SOLOV2.MASK_IN_FEATURES + self.mask_in_channels = cfg.MODEL.SOLOV2.MASK_IN_CHANNELS + self.mask_channels = cfg.MODEL.SOLOV2.MASK_CHANNELS + self.num_levels = len(input_shape) + assert self.num_levels == len(self.mask_in_features), \ + print("Input shape should match the features.") + # fmt: on + norm = None if cfg.MODEL.SOLOV2.NORM == "none" else cfg.MODEL.SOLOV2.NORM + + self.convs_all_levels = nn.ModuleList() + for i in range(self.num_levels): + convs_per_level = nn.Sequential() + if i == 0: + conv_tower = list() + conv_tower.append( + nn.Conv2d( + self.mask_in_channels, + self.mask_channels, + kernel_size=3, + stride=1, + padding=1, + bias=norm is None, + ) + ) + if norm == "GN": + conv_tower.append(nn.GroupNorm(32, self.mask_channels)) + conv_tower.append(nn.ReLU(inplace=False)) + convs_per_level.add_module("conv" + str(i), nn.Sequential(*conv_tower)) + self.convs_all_levels.append(convs_per_level) + continue + + for j in range(i): + if j == 0: + chn = self.mask_in_channels + 2 if i == 3 else self.mask_in_channels + conv_tower = list() + conv_tower.append( + nn.Conv2d( + chn, + self.mask_channels, + kernel_size=3, + stride=1, + padding=1, + bias=norm is None, + ) + ) + if norm == "GN": + conv_tower.append(nn.GroupNorm(32, self.mask_channels)) + conv_tower.append(nn.ReLU(inplace=False)) + convs_per_level.add_module( + "conv" + str(j), nn.Sequential(*conv_tower) + ) + upsample_tower = nn.Upsample( + scale_factor=2, mode="bilinear", align_corners=False + ) + convs_per_level.add_module("upsample" + str(j), upsample_tower) + continue + conv_tower = list() + conv_tower.append( + nn.Conv2d( + self.mask_channels, + self.mask_channels, + kernel_size=3, + stride=1, + padding=1, + bias=norm is None, + ) + ) + if norm == "GN": + conv_tower.append(nn.GroupNorm(32, self.mask_channels)) + conv_tower.append(nn.ReLU(inplace=False)) + convs_per_level.add_module("conv" + str(j), nn.Sequential(*conv_tower)) + upsample_tower = nn.Upsample( + scale_factor=2, mode="bilinear", align_corners=False + ) + convs_per_level.add_module("upsample" + str(j), upsample_tower) + + self.convs_all_levels.append(convs_per_level) + + self.conv_pred = nn.Sequential( + nn.Conv2d( + self.mask_channels, + self.num_masks, + 
kernel_size=1, + stride=1, + padding=0, + bias=norm is None, + ), + nn.GroupNorm(32, self.num_masks), + nn.ReLU(inplace=True), + ) + + for modules in [self.convs_all_levels, self.conv_pred]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.normal_(l.weight, std=0.01) + if l.bias is not None: + nn.init.constant_(l.bias, 0) + + def forward(self, features): + """ + Arguments: + features (list[Tensor]): FPN feature map tensors in high to low resolution. + Each tensor in the list correspond to different feature levels. + + Returns: + pass + """ + assert len(features) == self.num_levels, print( + "The number of input features should be equal to the supposed level." + ) + + # bottom features first. + feature_add_all_level = self.convs_all_levels[0](features[0]) + for i in range(1, self.num_levels): + mask_feat = features[i] + if i == 3: # add for coord. + x_range = torch.linspace( + -1, 1, mask_feat.shape[-1], device=mask_feat.device + ) + y_range = torch.linspace( + -1, 1, mask_feat.shape[-2], device=mask_feat.device + ) + y, x = torch.meshgrid(y_range, x_range) + y = y.expand([mask_feat.shape[0], 1, -1, -1]) + x = x.expand([mask_feat.shape[0], 1, -1, -1]) + coord_feat = torch.cat([x, y], 1) + mask_feat = torch.cat([mask_feat, coord_feat], 1) + # add for top features. + # feature_add_all_level += self.convs_all_levels[i](mask_feat) + # maybe issue of: https://github.com/jinfagang/yolov7/issues/30 + feature_add_all_level = feature_add_all_level + self.convs_all_levels[i]( + mask_feat + ) + mask_pred = self.conv_pred(feature_add_all_level) + return mask_pred diff --git a/yolov7/modeling/head/uniform_matcher.py b/yolov7/modeling/head/uniform_matcher.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/head/yolov6_head.py b/yolov7/modeling/head/yolov6_head.py new file mode 100644 index 0000000..cd3366b --- /dev/null +++ b/yolov7/modeling/head/yolov6_head.py @@ -0,0 +1,754 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
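For reference, the decoupled heads added in effidehead.py above and in the yolov6_head.py file that follows decode raw per-cell predictions into image-space boxes the same way at inference time: the xy channels are offset by a cell grid and scaled by the level stride, while the wh channels are exponentiated and scaled by the stride (see EffiDeHead.forward and Detect.forward). Below is a minimal standalone sketch of that decode step, not part of the patch itself; the function name decode_level, the tensor shapes, the 85-channel layout, and stride=32 are illustrative assumptions.

import torch

def decode_level(y, stride):
    # Illustrative sketch of the grid decode used by the decoupled heads in this patch.
    # y: (bs, na, ny, nx, no) raw head output for one feature level,
    # channels ordered [x, y, w, h, obj, cls...] as in Detect.forward.
    bs, na, ny, nx, no = y.shape
    yv, xv = torch.meshgrid(torch.arange(ny), torch.arange(nx))
    grid = torch.stack((xv, yv), 2).view(1, 1, ny, nx, 2).float()
    xy = (y[..., 0:2] + grid) * stride    # cell-relative xy -> image-space xy
    wh = torch.exp(y[..., 2:4]) * stride  # log-space wh -> image-space wh
    return torch.cat((xy, wh, y[..., 4:]), -1).view(bs, -1, no)

# e.g. a 20x20 grid at stride 32 yields 400 candidate boxes per image
boxes = decode_level(torch.randn(2, 1, 20, 20, 85), stride=32)
print(boxes.shape)  # torch.Size([2, 400, 85])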
+ +from loguru import logger + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from yolov7.utils.boxes import bboxes_iou, IOUlossV6, pairwise_bbox_iou + +from ..backbone.efficientrep import Conv +import math +import numpy as np +from alfred import print_shape + +""" + +yolov6 head with decoupled design + +""" + +def build_effidehead_layer(channels_list, num_anchors, num_classes): + head_layers = nn.Sequential( + # stem0 + Conv( + in_channels=channels_list[6], + out_channels=channels_list[6], + kernel_size=1, + stride=1, + ), + # cls_conv0 + Conv( + in_channels=channels_list[6], + out_channels=channels_list[6], + kernel_size=3, + stride=1, + ), + # reg_conv0 + Conv( + in_channels=channels_list[6], + out_channels=channels_list[6], + kernel_size=3, + stride=1, + ), + # cls_pred0 + nn.Conv2d( + in_channels=channels_list[6], + out_channels=num_classes * num_anchors, + kernel_size=1, + ), + # reg_pred0 + nn.Conv2d( + in_channels=channels_list[6], + out_channels=4 * num_anchors, + kernel_size=1, + ), + # obj_pred0 + nn.Conv2d( + in_channels=channels_list[6], + out_channels=1 * num_anchors, + kernel_size=1, + ), + # stem1 + Conv( + in_channels=channels_list[8], + out_channels=channels_list[8], + kernel_size=1, + stride=1, + ), + # cls_conv1 + Conv( + in_channels=channels_list[8], + out_channels=channels_list[8], + kernel_size=3, + stride=1, + ), + # reg_conv1 + Conv( + in_channels=channels_list[8], + out_channels=channels_list[8], + kernel_size=3, + stride=1, + ), + # cls_pred1 + nn.Conv2d( + in_channels=channels_list[8], + out_channels=num_classes * num_anchors, + kernel_size=1, + ), + # reg_pred1 + nn.Conv2d( + in_channels=channels_list[8], + out_channels=4 * num_anchors, + kernel_size=1, + ), + # obj_pred1 + nn.Conv2d( + in_channels=channels_list[8], + out_channels=1 * num_anchors, + kernel_size=1, + ), + # stem2 + Conv( + in_channels=channels_list[10], + out_channels=channels_list[10], + kernel_size=1, + stride=1, + ), + # cls_conv2 + Conv( + in_channels=channels_list[10], + out_channels=channels_list[10], + kernel_size=3, + stride=1, + ), + # reg_conv2 + Conv( + in_channels=channels_list[10], + out_channels=channels_list[10], + kernel_size=3, + stride=1, + ), + # cls_pred2 + nn.Conv2d( + in_channels=channels_list[10], + out_channels=num_classes * num_anchors, + kernel_size=1, + ), + # reg_pred2 + nn.Conv2d( + in_channels=channels_list[10], + out_channels=4 * num_anchors, + kernel_size=1, + ), + # obj_pred2 + nn.Conv2d( + in_channels=channels_list[10], + out_channels=1 * num_anchors, + kernel_size=1, + ), + ) + return head_layers + + +class Detect(nn.Module): + """Efficient Decoupled Head + With hardware-aware design, the decoupled head is optimized with + hybrid-channel methods.
+ """ + + def __init__( + self, num_classes=80, anchors=1, num_layers=3, inplace=True, head_layers=None + ): # detection layer + super().__init__() + assert head_layers is not None + self.nc = num_classes # number of classes + self.no = num_classes + 5 # number of outputs per anchor + self.nl = num_layers # number of detection layers + if isinstance(anchors, (list, tuple)): + self.na = len(anchors[0]) // 2 + else: + self.na = anchors + self.anchors = anchors + self.grid = [torch.zeros(1)] * num_layers + self.prior_prob = 1e-2 + self.inplace = inplace + stride = [8, 16, 32] # strides computed during build + self.stride = torch.tensor(stride) + + # Init decouple head + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + + # Efficient decoupled head layers + for i in range(num_layers): + idx = i * 6 + self.stems.append(head_layers[idx]) + self.cls_convs.append(head_layers[idx + 1]) + self.reg_convs.append(head_layers[idx + 2]) + self.cls_preds.append(head_layers[idx + 3]) + self.reg_preds.append(head_layers[idx + 4]) + self.obj_preds.append(head_layers[idx + 5]) + + def initialize_biases(self): + for conv in self.cls_preds: + b = conv.bias.view(self.na, -1) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + for conv in self.obj_preds: + b = conv.bias.view(self.na, -1) + b.data.fill_(-math.log((1 - self.prior_prob) / self.prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + def forward(self, x): + z = [] + for i in range(self.nl): + x[i] = self.stems[i](x[i]) + cls_x = x[i] + reg_x = x[i] + cls_feat = self.cls_convs[i](cls_x) + cls_output = self.cls_preds[i](cls_feat) + reg_feat = self.reg_convs[i](reg_x) + reg_output = self.reg_preds[i](reg_feat) + obj_output = self.obj_preds[i](reg_feat) + if self.training: + x[i] = torch.cat([reg_output, obj_output, cls_output], 1) + bs, _, ny, nx = x[i].shape + x[i] = ( + x[i] + .view(bs, self.na, self.no, ny, nx) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) + else: + y = torch.cat( + [reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1 + ) + bs, _, ny, nx = y.shape + y = ( + y.view(bs, self.na, self.no, ny, nx) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) + if self.grid[i].shape[2:4] != y.shape[2:4]: + d = self.stride.device + yv, xv = torch.meshgrid( + [torch.arange(ny).to(d), torch.arange(nx).to(d)] + ) + self.grid[i] = ( + torch.stack((xv, yv), 2).view(1, self.na, ny, nx, 2).float() + ) + if self.inplace: + y[..., 0:2] = (y[..., 0:2] + self.grid[i]) * self.stride[i] # xy + y[..., 2:4] = torch.exp(y[..., 2:4]) * self.stride[i] # wh + else: + xy = (y[..., 0:2] + self.grid[i]) * self.stride[i] # xy + wh = torch.exp(y[..., 2:4]) * self.stride[i] # wh + y = torch.cat((xy, wh, y[..., 4:]), -1) + z.append(y.view(bs, -1, self.no)) + return x if self.training else torch.cat(z, 1) + + +class YOLOv6Head(nn.Module): + def __init__( + self, + num_classes, + anchors=1, + num_layers=3, + channels_list=[256, 512, 1024], + ): + """ + Args: + num_classes (int): number of object classes. + anchors (int or list): anchor setting; an int gives the number of anchors per grid cell (default 1). + num_layers (int): number of detection layers (feature levels). + channels_list (list[int]): channel configuration consumed by build_effidehead_layer.
+ """ + super().__init__() + + self.num_anchors = anchors + self.num_classes = num_classes + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.stems = nn.ModuleList() + + self.channels_list = channels_list + head_layers = build_effidehead_layer(channels_list, self.num_anchors, num_classes) + self.det_head = Detect(num_classes, anchors, num_layers, head_layers=head_layers) + + self.use_l1 = False + self.compute_loss = ComputeLoss(iou_type='ciou') + self.onnx_export = False + + + def initialize_biases(self, prior_prob): + for conv in self.cls_preds: + b = conv.bias.view(self.n_anchors, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + for conv in self.obj_preds: + b = conv.bias.view(self.n_anchors, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + def forward(self, xin, labels=None, imgs=None): + outputs = self.det_head(xin) + for o in outputs: + print_shape(o) + if self.training: + losses = self.compute_loss(outputs, labels) + return losses + else: + self.hw = [x.shape[-2:] for x in outputs] + # [batch, n_anchors_all, 85] + outputs = torch.cat( + [x.flatten(start_dim=2) for x in outputs], dim=2 + ).permute(0, 2, 1) + if self.decode_in_inference: + return self.compute_loss.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + +class ComputeLoss: + """Loss computation func. + This func contains SimOTA and siou loss. + """ + + def __init__( + self, + reg_weight=5.0, + iou_weight=3.0, + cls_weight=1.0, + center_radius=2.5, + eps=1e-7, + in_channels=[256, 512, 1024], + strides=[8, 16, 32], + n_anchors=1, + iou_type="ciou", + ): + + self.reg_weight = reg_weight + self.iou_weight = iou_weight + self.cls_weight = cls_weight + + self.center_radius = center_radius + self.eps = eps + self.n_anchors = n_anchors + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + + # Define criteria + self.l1_loss = nn.L1Loss(reduction="none") + self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none") + self.iou_loss = IOUlossV6(iou_type=iou_type, reduction="none") + + def __call__(self, outputs, targets): + dtype = outputs[0].type() + device = targets.device + loss_cls, loss_obj, loss_iou, loss_l1 = ( + torch.zeros(1, device=device), + torch.zeros(1, device=device), + torch.zeros(1, device=device), + torch.zeros(1, device=device), + ) + num_classes = outputs[0].shape[-1] - 5 + + ( + outputs, + outputs_origin, + gt_bboxes_scale, + xy_shifts, + expanded_strides, + ) = self.get_outputs_and_grids(outputs, self.strides, dtype, device) + + total_num_anchors = outputs.shape[1] + bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4] + bbox_preds_org = outputs_origin[:, :, :4] # [batch, n_anchors_all, 4] + obj_preds = outputs[:, :, 4].unsqueeze(-1) # [batch, n_anchors_all, 1] + cls_preds = outputs[:, :, 5:] # [batch, n_anchors_all, n_cls] + + # targets + batch_size = bbox_preds.shape[0] + # targets_list = np.zeros((batch_size, 1, 5)).tolist() + print(targets.shape) + print(targets) + # 14,100,5 + # for i, item in enumerate(targets.cpu().numpy().tolist()): + # targets_list[int(item[0])].append(item[1:]) + # max_len = max((len(l) for l in targets_list)) + + # targets = torch.from_numpy( + # np.array( + # list( + # map( + # 
lambda l: l + [[-1, 0, 0, 0, 0]] * (max_len - len(l)), + # targets_list, + # ) + # ) + # )[:, 1:, :] + # ).to(targets.device) + num_targets_list = (targets.sum(dim=2) > 0).sum(dim=1) # number of objects + + num_fg, num_gts = 0, 0 + cls_targets, reg_targets, l1_targets, obj_targets, fg_masks = [], [], [], [], [] + + for batch_idx in range(batch_size): + num_gt = int(num_targets_list[batch_idx]) + num_gts += num_gt + if num_gt == 0: + cls_target = outputs.new_zeros((0, num_classes)) + reg_target = outputs.new_zeros((0, 4)) + l1_target = outputs.new_zeros((0, 4)) + obj_target = outputs.new_zeros((total_num_anchors, 1)) + fg_mask = outputs.new_zeros(total_num_anchors).bool() + else: + + gt_bboxes_per_image = targets[batch_idx, :num_gt, 1:5].mul_( + gt_bboxes_scale + ) + gt_classes = targets[batch_idx, :num_gt, 0] + bboxes_preds_per_image = bbox_preds[batch_idx] + cls_preds_per_image = cls_preds[batch_idx] + obj_preds_per_image = obj_preds[batch_idx] + + try: + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.get_assignments( + batch_idx, + num_gt, + total_num_anchors, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + cls_preds_per_image, + obj_preds_per_image, + expanded_strides, + xy_shifts, + num_classes, + ) + + except RuntimeError: + print( + "OOM RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." + ) + torch.cuda.empty_cache() + print("------------CPU Mode for This Batch-------------") + + _gt_bboxes_per_image = gt_bboxes_per_image.cpu().float() + _gt_classes = gt_classes.cpu().float() + _bboxes_preds_per_image = bboxes_preds_per_image.cpu().float() + _cls_preds_per_image = cls_preds_per_image.cpu().float() + _obj_preds_per_image = obj_preds_per_image.cpu().float() + + _expanded_strides = expanded_strides.cpu().float() + _xy_shifts = xy_shifts.cpu() + + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.get_assignments( + batch_idx, + num_gt, + total_num_anchors, + _gt_bboxes_per_image, + _gt_classes, + _bboxes_preds_per_image, + _cls_preds_per_image, + _obj_preds_per_image, + _expanded_strides, + _xy_shifts, + num_classes, + ) + + gt_matched_classes = gt_matched_classes.cuda() + fg_mask = fg_mask.cuda() + pred_ious_this_matching = pred_ious_this_matching.cuda() + matched_gt_inds = matched_gt_inds.cuda() + + torch.cuda.empty_cache() + num_fg += num_fg_img + if num_fg_img > 0: + cls_target = F.one_hot( + gt_matched_classes.to(torch.int64), num_classes + ) * pred_ious_this_matching.unsqueeze(-1) + obj_target = fg_mask.unsqueeze(-1) + reg_target = gt_bboxes_per_image[matched_gt_inds] + + l1_target = self.get_l1_target( + outputs.new_zeros((num_fg_img, 4)), + gt_bboxes_per_image[matched_gt_inds], + expanded_strides[0][fg_mask], + xy_shifts=xy_shifts[0][fg_mask], + ) + + cls_targets.append(cls_target) + reg_targets.append(reg_target) + obj_targets.append(obj_target) + l1_targets.append(l1_target) + fg_masks.append(fg_mask) + + cls_targets = torch.cat(cls_targets, 0) + reg_targets = torch.cat(reg_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + l1_targets = torch.cat(l1_targets, 0) + fg_masks = torch.cat(fg_masks, 0) + + num_fg = max(num_fg, 1) + # loss + loss_iou += ( + self.iou_loss(bbox_preds.view(-1, 4)[fg_masks].T, reg_targets) + ).sum() / num_fg + loss_l1 += ( + self.l1_loss(bbox_preds_org.view(-1, 4)[fg_masks], 
l1_targets) + ).sum() / num_fg + + loss_obj += ( + self.bcewithlog_loss(obj_preds.view(-1, 1), obj_targets * 1.0) + ).sum() / num_fg + loss_cls += ( + self.bcewithlog_loss(cls_preds.view(-1, num_classes)[fg_masks], cls_targets) + ).sum() / num_fg + + total_losses = self.reg_weight * loss_iou + loss_l1 + loss_obj + loss_cls + return ( + total_losses, + torch.cat( + (self.reg_weight * loss_iou, loss_l1, loss_obj, loss_cls) + ).detach(), + ) + + def decode_output(self, output, k, stride, dtype, device): + grid = self.grids[k].to(device) + batch_size = output.shape[0] + hsize, wsize = output.shape[2:4] + if grid.shape[2:4] != output.shape[2:4]: + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = ( + torch.stack((xv, yv), 2) + .view(1, 1, hsize, wsize, 2) + .type(dtype) + .to(device) + ) + self.grids[k] = grid + + output = output.reshape(batch_size, self.n_anchors * hsize * wsize, -1) + output_origin = output.clone() + grid = grid.view(1, -1, 2) + + output[..., :2] = (output[..., :2] + grid) * stride + output[..., 2:4] = torch.exp(output[..., 2:4]) * stride + + return output, output_origin, grid, hsize, wsize + + def get_outputs_and_grids(self, outputs, strides, dtype, device): + xy_shifts = [] + expanded_strides = [] + outputs_new = [] + outputs_origin = [] + + for k, output in enumerate(outputs): + output, output_origin, grid, feat_h, feat_w = self.decode_output( + output, k, strides[k], dtype, device + ) + + xy_shift = grid + expanded_stride = torch.full( + (1, grid.shape[1], 1), strides[k], dtype=grid.dtype, device=grid.device + ) + + xy_shifts.append(xy_shift) + expanded_strides.append(expanded_stride) + outputs_new.append(output) + outputs_origin.append(output_origin) + + xy_shifts = torch.cat(xy_shifts, 1) # [1, n_anchors_all, 2] + expanded_strides = torch.cat(expanded_strides, 1) # [1, n_anchors_all, 1] + outputs_origin = torch.cat(outputs_origin, 1) + outputs = torch.cat(outputs_new, 1) + + feat_h *= strides[-1] + feat_w *= strides[-1] + gt_bboxes_scale = torch.Tensor([[feat_w, feat_h, feat_w, feat_h]]).type_as( + outputs + ) + + return outputs, outputs_origin, gt_bboxes_scale, xy_shifts, expanded_strides + + def get_l1_target(self, l1_target, gt, stride, xy_shifts, eps=1e-8): + + l1_target[:, 0:2] = gt[:, 0:2] / stride - xy_shifts + l1_target[:, 2:4] = torch.log(gt[:, 2:4] / stride + eps) + return l1_target + + @torch.no_grad() + def get_assignments( + self, + batch_idx, + num_gt, + total_num_anchors, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + cls_preds_per_image, + obj_preds_per_image, + expanded_strides, + xy_shifts, + num_classes, + ): + fg_mask, is_in_boxes_and_center = self.get_in_boxes_info( + gt_bboxes_per_image, + expanded_strides, + xy_shifts, + total_num_anchors, + num_gt, + ) + + bboxes_preds_per_image = bboxes_preds_per_image[fg_mask] + cls_preds_ = cls_preds_per_image[fg_mask] + obj_preds_ = obj_preds_per_image[fg_mask] + num_in_boxes_anchor = bboxes_preds_per_image.shape[0] + + # cost + pair_wise_ious = pairwise_bbox_iou( + gt_bboxes_per_image, bboxes_preds_per_image, box_format="xywh" + ) + pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8) + + gt_cls_per_image = ( + F.one_hot(gt_classes.to(torch.int64), num_classes) + .float() + .unsqueeze(1) + .repeat(1, num_in_boxes_anchor, 1) + ) + + with torch.cuda.amp.autocast(enabled=False): + cls_preds_ = cls_preds_.float().sigmoid_().unsqueeze(0).repeat( + num_gt, 1, 1 + ) * obj_preds_.float().sigmoid_().unsqueeze(0).repeat(num_gt, 1, 1) + pair_wise_cls_loss = 
F.binary_cross_entropy( + cls_preds_.sqrt_(), gt_cls_per_image, reduction="none" + ).sum(-1) + del cls_preds_, obj_preds_ + + cost = ( + self.cls_weight * pair_wise_cls_loss + + self.iou_weight * pair_wise_ious_loss + + 100000.0 * (~is_in_boxes_and_center) + ) + print_shape(cost, pair_wise_ious, gt_classes) + + ( + num_fg, + gt_matched_classes, + pred_ious_this_matching, + matched_gt_inds, + ) = self.dynamic_k_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask) + + del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss + + return ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg, + ) + + def get_in_boxes_info( + self, + gt_bboxes_per_image, + expanded_strides, + xy_shifts, + total_num_anchors, + num_gt, + ): + expanded_strides_per_image = expanded_strides[0] + xy_shifts_per_image = xy_shifts[0] * expanded_strides_per_image + xy_centers_per_image = ( + (xy_shifts_per_image + 0.5 * expanded_strides_per_image) + .unsqueeze(0) + .repeat(num_gt, 1, 1) + ) # [n_anchor, 2] -> [n_gt, n_anchor, 2] + + gt_bboxes_per_image_lt = ( + (gt_bboxes_per_image[:, 0:2] - 0.5 * gt_bboxes_per_image[:, 2:4]) + .unsqueeze(1) + .repeat(1, total_num_anchors, 1) + ) + gt_bboxes_per_image_rb = ( + (gt_bboxes_per_image[:, 0:2] + 0.5 * gt_bboxes_per_image[:, 2:4]) + .unsqueeze(1) + .repeat(1, total_num_anchors, 1) + ) # [n_gt, 2] -> [n_gt, n_anchor, 2] + + b_lt = xy_centers_per_image - gt_bboxes_per_image_lt + b_rb = gt_bboxes_per_image_rb - xy_centers_per_image + bbox_deltas = torch.cat([b_lt, b_rb], 2) + + is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0 + is_in_boxes_all = is_in_boxes.sum(dim=0) > 0 + + # in fixed center + gt_bboxes_per_image_lt = (gt_bboxes_per_image[:, 0:2]).unsqueeze(1).repeat( + 1, total_num_anchors, 1 + ) - self.center_radius * expanded_strides_per_image.unsqueeze(0) + gt_bboxes_per_image_rb = (gt_bboxes_per_image[:, 0:2]).unsqueeze(1).repeat( + 1, total_num_anchors, 1 + ) + self.center_radius * expanded_strides_per_image.unsqueeze(0) + + c_lt = xy_centers_per_image - gt_bboxes_per_image_lt + c_rb = gt_bboxes_per_image_rb - xy_centers_per_image + center_deltas = torch.cat([c_lt, c_rb], 2) + is_in_centers = center_deltas.min(dim=-1).values > 0.0 + is_in_centers_all = is_in_centers.sum(dim=0) > 0 + + # in boxes and in centers + is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all + + is_in_boxes_and_center = ( + is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor] + ) + return is_in_boxes_anchor, is_in_boxes_and_center + + def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): + matching_matrix = torch.zeros_like(cost, dtype=torch.uint8) + ious_in_boxes_matrix = pair_wise_ious + n_candidate_k = min(10, ious_in_boxes_matrix.size(1)) + topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1) + dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1) + dynamic_ks = dynamic_ks.tolist() + + for gt_idx in range(num_gt): + _, pos_idx = torch.topk(cost[gt_idx], k=dynamic_ks[gt_idx], largest=False) + matching_matrix[gt_idx][pos_idx] = 1 + del topk_ious, dynamic_ks, pos_idx + + anchor_matching_gt = matching_matrix.sum(0) + if (anchor_matching_gt > 1).sum() > 0: + _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0) + matching_matrix[:, anchor_matching_gt > 1] *= 0 + matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1 + fg_mask_inboxes = matching_matrix.sum(0) > 0 + num_fg = fg_mask_inboxes.sum().item() + fg_mask[fg_mask.clone()] = fg_mask_inboxes + matched_gt_inds = 
matching_matrix[:, fg_mask_inboxes].argmax(0) + gt_matched_classes = gt_classes[matched_gt_inds] + + pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[ + fg_mask_inboxes + ] + + return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds diff --git a/yolov7/modeling/head/yolox_head.py b/yolov7/modeling/head/yolox_head.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/head/yolox_kpts_head.py b/yolov7/modeling/head/yolox_kpts_head.py new file mode 100644 index 0000000..387d1b4 --- /dev/null +++ b/yolov7/modeling/head/yolox_kpts_head.py @@ -0,0 +1,741 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + +import math +from loguru import logger + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from yolov7.utils.boxes import bboxes_iou, IOUloss +from ..backbone.layers.wrappers import BaseConv, DWConv +import math + + +''' +Add Keypoints Head to YOLOX, this is single-stage keypoints detector +still working in progress + +We predicting keypoints in a regression way, rather than HeatMaps +''' + + +class YOLOXHeadKPTS(nn.Module): + def __init__( + self, + num_classes, + width=1.0, + strides=[8, 16, 32], + in_channels=[256, 512, 1024], + act="silu", + depthwise=False, + num_kpts = 17, + ): + """ + Args: + act (str): activation type of conv. Defalut value: "silu". + depthwise (bool): whether apply depthwise conv in conv branch. Defalut value: False. + """ + super().__init__() + + self.n_anchors = 1 + self.num_classes = num_classes + self.num_kpts = num_kpts + self.decode_in_inference = True # for deploy, set to False + + self.cls_convs = nn.ModuleList() + self.reg_convs = nn.ModuleList() + self.kpts_convs = nn.ModuleList() + self.cls_preds = nn.ModuleList() + self.reg_preds = nn.ModuleList() + self.obj_preds = nn.ModuleList() + self.kpts_preds = nn.ModuleList() + self.stems = nn.ModuleList() + self.sigmas = torch.tensor([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89]) / 10.0 + Conv = DWConv if depthwise else BaseConv + + for i in range(len(in_channels)): + self.stems.append( + BaseConv( + in_channels=int(in_channels[i] * width), + out_channels=int(256 * width), + ksize=1, + stride=1, + act=act, + ) + ) + self.cls_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.reg_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.kpts_convs.append( + nn.Sequential( + *[ + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + Conv( + in_channels=int(256 * width), + out_channels=int(256 * width), + ksize=3, + stride=1, + act=act, + ), + ] + ) + ) + self.cls_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * self.num_classes, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.reg_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=4, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.obj_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + 
out_channels=self.n_anchors * 1, + kernel_size=1, + stride=1, + padding=0, + ) + ) + self.kpts_preds.append( + nn.Conv2d( + in_channels=int(256 * width), + out_channels=self.n_anchors * self.num_kpts * 3, + kernel_size=1, + stride=1, + padding=0, + ) + ) + + self.use_l1 = False + self.l1_loss = nn.L1Loss(reduction="none") + self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none") + self.iou_loss = IOUloss(reduction="none") + self.strides = strides + self.grids = [torch.zeros(1)] * len(in_channels) + + def initialize_biases(self, prior_prob): + for conv in self.cls_preds: + b = conv.bias.view(self.n_anchors, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + for conv in self.obj_preds: + b = conv.bias.view(self.n_anchors, -1) + b.data.fill_(-math.log((1 - prior_prob) / prior_prob)) + conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True) + + def forward(self, xin, labels=None, imgs=None): + outputs = [] + origin_preds = [] + x_shifts = [] + y_shifts = [] + expanded_strides = [] + + for k, (cls_conv, reg_conv, kpts_conv, stride_this_level, x) in enumerate( + zip(self.cls_convs, self.reg_convs, self.kpts_convs, self.strides, xin) + ): + x = self.stems[k](x) + cls_x = x + reg_x = x + kpts_x = x + + cls_feat = cls_conv(cls_x) + cls_output = self.cls_preds[k](cls_feat) + + reg_feat = reg_conv(reg_x) + reg_output = self.reg_preds[k](reg_feat) + obj_output = self.obj_preds[k](reg_feat) + + kpts_feat = kpts_conv(kpts_x) + kpts_output = self.kpts_preds[k](kpts_feat) + + if self.training: + output = torch.cat([reg_output, obj_output, cls_output, kpts_output], 1) + output, grid = self.get_output_and_grid( + output, k, stride_this_level, xin[0].type() + ) + x_shifts.append(grid[:, :, 0]) + y_shifts.append(grid[:, :, 1]) + expanded_strides.append( + torch.zeros(1, grid.shape[1]) + .fill_(stride_this_level) + .type_as(xin[0]) + ) + if self.use_l1: + batch_size = reg_output.shape[0] + hsize, wsize = reg_output.shape[-2:] + reg_output = reg_output.view( + batch_size, self.n_anchors, 4, hsize, wsize + ) + reg_output = reg_output.permute(0, 1, 3, 4, 2).reshape( + batch_size, -1, 4 + ) + origin_preds.append(reg_output.clone()) + + else: + output = torch.cat( + [reg_output, obj_output.sigmoid(), cls_output.sigmoid(), kpts_output], 1 + ) + + outputs.append(output) + + if self.training: + return self.get_losses( + imgs, + x_shifts, + y_shifts, + expanded_strides, + labels, + torch.cat(outputs, 1), + origin_preds, + dtype=xin[0].dtype, + ) + else: + self.hw = [x.shape[-2:] for x in outputs] + # [batch, n_anchors_all, 85] + outputs = torch.cat( + [x.flatten(start_dim=2) for x in outputs], dim=2 + ).permute(0, 2, 1) + if self.decode_in_inference: + return self.decode_outputs(outputs, dtype=xin[0].type()) + else: + return outputs + + def get_output_and_grid(self, output, k, stride, dtype): + grid = self.grids[k] + + batch_size = output.shape[0] + n_ch = 5 + self.num_classes+ 3*self.num_kpts + hsize, wsize = output.shape[-2:] + if grid.shape[2:4] != output.shape[2:4]: + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype) + self.grids[k] = grid + + output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize) + output = output.permute(0, 1, 3, 4, 2).reshape( + batch_size, self.n_anchors * hsize * wsize, -1 + ) + grid = grid.view(1, -1, 2) + kpt_conf_grids = torch.zeros_like(grid)[...,0:1] + kpt_grids = torch.cat((grid, 
kpt_conf_grids), dim=2) + + output[..., :2] = (output[..., :2] + grid) * stride + output[..., 2:4] = torch.exp(output[..., 2:4]) * stride + output[..., 6:] = (output[..., 6:] + kpt_grids.repeat(1,1,self.num_kpts)) * stride + return output, grid + + def decode_outputs(self, outputs, dtype): + grids = [] + strides = [] + for (hsize, wsize), stride in zip(self.hw, self.strides): + yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)]) + grid = torch.stack((xv, yv), 2).view(1, -1, 2) + grids.append(grid) + shape = grid.shape[:2] + strides.append(torch.full((*shape, 1), stride)) + + grids = torch.cat(grids, dim=1).type(dtype) + kpt_conf_grids = torch.zeros_like(grids)[...,0:1] + kpt_grids = torch.cat((grids, kpt_conf_grids), dim=2) + strides = torch.cat(strides, dim=1).type(dtype) + + if self.onnx_export: + xy, wh, conf, prob, kpts = torch.split(outputs, [2, 2, 1, self.num_classes, 3*self.num_kpts], dim=2) + xy = (xy + grids)*strides + wh = torch.exp(wh)*strides + idxs = torch.argmax(prob, dim=-1).unsqueeze(axis=-1).type(xy.dtype) + kpts = (kpts + kpt_grids.repeat(1,1,self.num_kpts)) * strides + outputs = torch.cat((xy, wh, conf, idxs, prob, kpts), dim=2) + # TODO: append keypoints when onnx export + else: + outputs[..., :2] = (outputs[..., :2] + grids) * strides + outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides + outputs[..., 6:] = (outputs[..., 6:] + kpt_grids.repeat(1,1,self.num_kpts)) * strides + return outputs + + def get_losses( + self, + imgs, + x_shifts, + y_shifts, + expanded_strides, + labels, + outputs, + origin_preds, + dtype, + ): + bbox_preds = outputs[:, :, :4] # [batch, n_anchors_all, 4] + obj_preds = outputs[:, :, 4].unsqueeze(-1) # [batch, n_anchors_all, 1] + cls_preds = outputs[:, :, 5 : 5+self.num_classes] # [batch, n_anchors_all, n_cls] + kpts_preds = outputs[:, :, 5+self.num_classes:] + + # calculate targets + mixup = labels.shape[2] > 5 + if mixup: + label_cut = labels[..., :5] + else: + label_cut = labels + nlabel = (label_cut.sum(dim=2) > 0).sum(dim=1) # number of objects + + total_num_anchors = outputs.shape[1] + x_shifts = torch.cat(x_shifts, 1) # [1, n_anchors_all] + y_shifts = torch.cat(y_shifts, 1) # [1, n_anchors_all] + expanded_strides = torch.cat(expanded_strides, 1) + if self.use_l1: + origin_preds = torch.cat(origin_preds, 1) + + cls_targets = [] + reg_targets = [] + l1_targets = [] + obj_targets = [] + kpts_targets = [] + fg_masks = [] + + num_fg = 0.0 + num_gts = 0.0 + + for batch_idx in range(outputs.shape[0]): + num_gt = int(nlabel[batch_idx]) + num_gts += num_gt + if num_gt == 0: + cls_target = outputs.new_zeros((0, self.num_classes)) + reg_target = outputs.new_zeros((0, 4)) + kpts_target = outputs.new_zeros((0, 2*self.num_kpts)) + l1_target = outputs.new_zeros((0, 4)) + obj_target = outputs.new_zeros((total_num_anchors, 1)) + fg_mask = outputs.new_zeros(total_num_anchors).bool() + else: + gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5] + kpts_per_image = labels[batch_idx, :num_gt, 5:] + gt_classes = labels[batch_idx, :num_gt, 0] + bboxes_preds_per_image = bbox_preds[batch_idx] + kpts_preds_per_image = kpts_preds[batch_idx] + + try: + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.get_assignments( # noqa + batch_idx, + num_gt, + total_num_anchors, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + bbox_preds, + obj_preds, + labels, + imgs, + ) + except RuntimeError: + logger.error( + "OOM 
RuntimeError is raised due to the huge memory cost during label assignment. \ + CPU mode is applied in this batch. If you want to avoid this issue, \ + try to reduce the batch size or image size." + ) + torch.cuda.empty_cache() + ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg_img, + ) = self.get_assignments( # noqa + batch_idx, + num_gt, + total_num_anchors, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + bbox_preds, + obj_preds, + labels, + imgs, + "cpu", + ) + + torch.cuda.empty_cache() + num_fg += num_fg_img + + cls_target = F.one_hot( + gt_matched_classes.to(torch.int64), self.num_classes + ) * pred_ious_this_matching.unsqueeze(-1) + obj_target = fg_mask.unsqueeze(-1) + reg_target = gt_bboxes_per_image[matched_gt_inds] + kpts_target = kpts_per_image[matched_gt_inds] + if self.use_l1: + l1_target = self.get_l1_target( + outputs.new_zeros((num_fg_img, 4)), + gt_bboxes_per_image[matched_gt_inds], + gt_bboxes_per_image[matched_gt_inds], + expanded_strides[0][fg_mask], + x_shifts=x_shifts[0][fg_mask], + y_shifts=y_shifts[0][fg_mask], + ) + + cls_targets.append(cls_target) + reg_targets.append(reg_target) + kpts_targets.append(kpts_target) + obj_targets.append(obj_target.to(dtype)) + fg_masks.append(fg_mask) + if self.use_l1: + l1_targets.append(l1_target) + + cls_targets = torch.cat(cls_targets, 0) + reg_targets = torch.cat(reg_targets, 0) + kpts_targets = torch.cat(kpts_targets, 0) + obj_targets = torch.cat(obj_targets, 0) + fg_masks = torch.cat(fg_masks, 0) + if self.use_l1: + l1_targets = torch.cat(l1_targets, 0) + + num_fg = max(num_fg, 1) + loss_iou = ( + self.iou_loss(bbox_preds.view(-1, 4)[fg_masks], reg_targets) + ).sum() / num_fg + loss_obj = ( + self.bcewithlog_loss(obj_preds.view(-1, 1), obj_targets) + ).sum() / num_fg + loss_cls = ( + self.bcewithlog_loss( + cls_preds.view(-1, self.num_classes)[fg_masks], cls_targets + ) + ).sum() / num_fg + loss_kpts, loss_kpts_vis = self.kpts_loss( + kpts_preds.view(-1, self.num_kpts*3)[fg_masks], kpts_targets, reg_targets) + loss_kpts = loss_kpts.sum() / num_fg + loss_kpts_vis = loss_kpts_vis.sum() / num_fg + + if self.use_l1: + loss_l1 = ( + self.l1_loss(origin_preds.view(-1, 4)[fg_masks], l1_targets) + ).sum() / num_fg + else: + loss_l1 = 0.0 + + reg_weight = 5.0 + loss = reg_weight * loss_iou + loss_obj + loss_cls + loss_l1 + reg_weight * loss_kpts + loss_kpts_vis + + return ( + loss, + reg_weight * loss_iou, + loss_obj, + loss_cls, + loss_l1, + reg_weight * loss_kpts, + loss_kpts_vis, + num_fg / max(num_gts, 1), + ) + + def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8): + l1_target[:, 0] = gt[:, 0] / stride - x_shifts + l1_target[:, 1] = gt[:, 1] / stride - y_shifts + l1_target[:, 2] = torch.log(gt[:, 2] / stride + eps) + l1_target[:, 3] = torch.log(gt[:, 3] / stride + eps) + return l1_target + + @torch.no_grad() + def get_assignments( + self, + batch_idx, + num_gt, + total_num_anchors, + gt_bboxes_per_image, + gt_classes, + bboxes_preds_per_image, + expanded_strides, + x_shifts, + y_shifts, + cls_preds, + bbox_preds, + obj_preds, + labels, + imgs, + mode="gpu", + ): + + if mode == "cpu": + print("------------CPU Mode for This Batch-------------") + gt_bboxes_per_image = gt_bboxes_per_image.cpu().float() + bboxes_preds_per_image = bboxes_preds_per_image.cpu().float() + gt_classes = gt_classes.cpu().float() + expanded_strides = expanded_strides.cpu().float() + x_shifts = x_shifts.cpu() + y_shifts = 
y_shifts.cpu() + + fg_mask, is_in_boxes_and_center = self.get_in_boxes_info( + gt_bboxes_per_image, + expanded_strides, + x_shifts, + y_shifts, + total_num_anchors, + num_gt, + ) + + bboxes_preds_per_image = bboxes_preds_per_image[fg_mask] + cls_preds_ = cls_preds[batch_idx][fg_mask] + obj_preds_ = obj_preds[batch_idx][fg_mask] + num_in_boxes_anchor = bboxes_preds_per_image.shape[0] + + if mode == "cpu": + gt_bboxes_per_image = gt_bboxes_per_image.cpu() + bboxes_preds_per_image = bboxes_preds_per_image.cpu() + + pair_wise_ious = bboxes_iou(gt_bboxes_per_image, bboxes_preds_per_image, False) + + gt_cls_per_image = ( + F.one_hot(gt_classes.to(torch.int64), self.num_classes) + .float() + .unsqueeze(1) + .repeat(1, num_in_boxes_anchor, 1) + ) + pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8) + + if mode == "cpu": + cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu() + + with torch.cuda.amp.autocast(enabled=False): + cls_preds_ = ( + cls_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() + * obj_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_() + ) + pair_wise_cls_loss = F.binary_cross_entropy( + cls_preds_.sqrt_(), gt_cls_per_image, reduction="none" + ).sum(-1) + del cls_preds_ + + cost = ( + pair_wise_cls_loss + + 3.0 * pair_wise_ious_loss + + 100000.0 * (~is_in_boxes_and_center) + ) + + ( + num_fg, + gt_matched_classes, + pred_ious_this_matching, + matched_gt_inds, + ) = self.dynamic_k_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask) + del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss + + if mode == "cpu": + gt_matched_classes = gt_matched_classes.cuda() + fg_mask = fg_mask.cuda() + pred_ious_this_matching = pred_ious_this_matching.cuda() + matched_gt_inds = matched_gt_inds.cuda() + + return ( + gt_matched_classes, + fg_mask, + pred_ious_this_matching, + matched_gt_inds, + num_fg, + ) + + def get_in_boxes_info( + self, + gt_bboxes_per_image, + expanded_strides, + x_shifts, + y_shifts, + total_num_anchors, + num_gt, + ): + expanded_strides_per_image = expanded_strides[0] + x_shifts_per_image = x_shifts[0] * expanded_strides_per_image + y_shifts_per_image = y_shifts[0] * expanded_strides_per_image + x_centers_per_image = ( + (x_shifts_per_image + 0.5 * expanded_strides_per_image) + .unsqueeze(0) + .repeat(num_gt, 1) + ) # [n_anchor] -> [n_gt, n_anchor] + y_centers_per_image = ( + (y_shifts_per_image + 0.5 * expanded_strides_per_image) + .unsqueeze(0) + .repeat(num_gt, 1) + ) + + gt_bboxes_per_image_l = ( + (gt_bboxes_per_image[:, 0] - 0.5 * gt_bboxes_per_image[:, 2]) + .unsqueeze(1) + .repeat(1, total_num_anchors) + ) + gt_bboxes_per_image_r = ( + (gt_bboxes_per_image[:, 0] + 0.5 * gt_bboxes_per_image[:, 2]) + .unsqueeze(1) + .repeat(1, total_num_anchors) + ) + gt_bboxes_per_image_t = ( + (gt_bboxes_per_image[:, 1] - 0.5 * gt_bboxes_per_image[:, 3]) + .unsqueeze(1) + .repeat(1, total_num_anchors) + ) + gt_bboxes_per_image_b = ( + (gt_bboxes_per_image[:, 1] + 0.5 * gt_bboxes_per_image[:, 3]) + .unsqueeze(1) + .repeat(1, total_num_anchors) + ) + + b_l = x_centers_per_image - gt_bboxes_per_image_l + b_r = gt_bboxes_per_image_r - x_centers_per_image + b_t = y_centers_per_image - gt_bboxes_per_image_t + b_b = gt_bboxes_per_image_b - y_centers_per_image + bbox_deltas = torch.stack([b_l, b_t, b_r, b_b], 2) + + is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0 + is_in_boxes_all = is_in_boxes.sum(dim=0) > 0 + # in fixed center + + center_radius = 2.5 + + gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat( + 1, 
total_num_anchors + ) - center_radius * expanded_strides_per_image.unsqueeze(0) + gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat( + 1, total_num_anchors + ) + center_radius * expanded_strides_per_image.unsqueeze(0) + gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat( + 1, total_num_anchors + ) - center_radius * expanded_strides_per_image.unsqueeze(0) + gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat( + 1, total_num_anchors + ) + center_radius * expanded_strides_per_image.unsqueeze(0) + + c_l = x_centers_per_image - gt_bboxes_per_image_l + c_r = gt_bboxes_per_image_r - x_centers_per_image + c_t = y_centers_per_image - gt_bboxes_per_image_t + c_b = gt_bboxes_per_image_b - y_centers_per_image + center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2) + is_in_centers = center_deltas.min(dim=-1).values > 0.0 + is_in_centers_all = is_in_centers.sum(dim=0) > 0 + + # in boxes and in centers + is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all + + is_in_boxes_and_center = ( + is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor] + ) + return is_in_boxes_anchor, is_in_boxes_and_center + + def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask): + # Dynamic K + # --------------------------------------------------------------- + matching_matrix = torch.zeros_like(cost) + + ious_in_boxes_matrix = pair_wise_ious + n_candidate_k = min(10, ious_in_boxes_matrix.size(1)) + topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1) + dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1) + for gt_idx in range(num_gt): + _, pos_idx = torch.topk( + cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False + ) + matching_matrix[gt_idx][pos_idx] = 1.0 + + del topk_ious, dynamic_ks, pos_idx + + anchor_matching_gt = matching_matrix.sum(0) + if (anchor_matching_gt > 1).sum() > 0: + _, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0) + matching_matrix[:, anchor_matching_gt > 1] *= 0.0 + matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0 + fg_mask_inboxes = matching_matrix.sum(0) > 0.0 + num_fg = fg_mask_inboxes.sum().item() + + fg_mask[fg_mask.clone()] = fg_mask_inboxes + + matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0) + gt_matched_classes = gt_classes[matched_gt_inds] + + pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[ + fg_mask_inboxes + ] + return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds + + + def kpts_loss(self, kpts_preds, kpts_targets, bbox_targets): + sigmas = torch.tensor([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89], device=kpts_preds.device) / 10.0 + kpts_preds_x, kpts_targets_x = kpts_preds[:, 0::3], kpts_targets[:, 0::2] + kpts_preds_y, kpts_targets_y = kpts_preds[:, 1::3], kpts_targets[:, 1::2] + kpts_preds_score = kpts_preds[:, 2::3] + # mask + kpt_mask = (kpts_targets[:, 0::2] != 0) + lkptv = self.bcewithlog_loss(kpts_preds_score, kpt_mask.float()).mean(axis=1) + # OKS based loss + d = (kpts_preds_x - kpts_targets_x) ** 2 + (kpts_preds_y - kpts_targets_y) ** 2 + bbox_scale = torch.prod(bbox_targets[:, -2:], dim=1, keepdim=True) #scale derived from bbox gt + kpt_loss_factor = (torch.sum(kpt_mask != 0) + torch.sum(kpt_mask == 0)) / torch.sum(kpt_mask != 0) + oks = torch.exp(-d / (bbox_scale * (4 * sigmas) + 1e-9)) + lkpt = kpt_loss_factor * ((1 - oks**2) * kpt_mask).mean(axis=1) + + return lkpt, lkptv + diff --git a/yolov7/modeling/loss/loss.py 
b/yolov7/modeling/loss/loss.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/loss/seg.py b/yolov7/modeling/loss/seg.py new file mode 100644 index 0000000..4b003f5 --- /dev/null +++ b/yolov7/modeling/loss/seg.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +This file provides the definition of the convolutional heads used to predict masks, as well as the losses +""" +import io +from collections import defaultdict +from typing import List, Optional +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from torch import Tensor + + + +def dice_loss(inputs, targets, num_boxes): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +def sigmoid_focal_loss( + inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2 +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes diff --git a/yolov7/modeling/loss/setcriterion.py b/yolov7/modeling/loss/setcriterion.py new file mode 100644 index 0000000..06b6863 --- /dev/null +++ b/yolov7/modeling/loss/setcriterion.py @@ -0,0 +1,398 @@ +import copy + +import torch +import torch.nn.functional as F +from torch import nn + + +import yolov7.utils.boxes as box_ops +from yolov7.utils.misc import nested_tensor_from_tensor_list, accuracy, interpolate, is_dist_avail_and_initialized, get_world_size +from .seg import dice_loss, sigmoid_focal_loss + + +def _reduce_num_boxes(targets, device): + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=device) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + return num_boxes + + +class SetCriterion(nn.Module): + """This class computes the loss for DETR. 
+ The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. + """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert "pred_logits" in outputs + # shape (batch_size, num_queries, NUM_CLASS + 1) + src_logits = outputs["pred_logits"] + # idx = (batch_idx, src_idx) + # batch_idx shape [\sum_b num_match_b] + # src_idx shape [\sum_b num_match_b] + idx = self._get_src_permutation_idx(indices) + # targets: List[Dict[str, torch.Tensor]]. Keys + # "labels": [NUM_BOX,] + # "boxes": [NUM_BOX, 4] + # target_classes_o shape [batch_size * num_match] + target_classes_o = torch.cat( + [t["labels"][J] for t, (_, J) in zip(targets, indices)] + ) + # shape (batch_size, num_queries) + target_classes = torch.full( + src_logits.shape[:2], + self.num_classes, + dtype=torch.int64, + device=src_logits.device, + ) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy( + src_logits.transpose(1, 2), target_classes, self.empty_weight + ) + losses = {"loss_ce": loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses["class_error"] = 100 - \ + accuracy(src_logits[idx], target_classes_o)[0] + return losses + + def forground_background_loss_labels( + self, outputs, targets, indices, num_boxes, log=True + ): + assert "pred_logits" in outputs + # shape (batch_size, num_queries, 1) + src_logits = outputs["pred_logits"] + + batch_size, num_queries = src_logits.shape[:2] + + assert src_logits.shape[2] == 1, f"expect 1 class {src_logits.shape[2]}" + idx = self._get_src_permutation_idx(indices) + + target_classes_o = torch.cat( + [t["labels"][J] for t, (_, J) in zip(targets, indices)] + ) + target_classes = torch.full( + src_logits.shape[:2], + 1, + dtype=torch.int64, + device=src_logits.device, + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [src_logits.shape[0], src_logits.shape[1], 2], + dtype=src_logits.dtype, + layout=src_logits.layout, + device=src_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + target_classes_onehot = target_classes_onehot[:, :, :-1] + + loss_ce = ( + sigmoid_focal_loss( + src_logits, + target_classes_onehot, + num_boxes, + alpha=self.focal_alpha, + gamma=2, + ) + * src_logits.shape[1] + ) + return {"loss_ce": loss_ce} + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + 
"""Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs["pred_logits"] + device = pred_logits.device + tgt_lengths = torch.as_tensor( + [len(v["labels"]) for v in targets], device=device + ) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != + pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert "pred_boxes" in outputs + idx = self._get_src_permutation_idx(indices) + # shape [\sum_b num_matches_b, 4] + src_boxes = outputs["pred_boxes"][idx] + # shape [\sum_b num_matches_b, 4] + target_boxes = torch.cat( + [t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0 + ) + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + box_ops.generalized_box_iou( + box_ops.box_cxcywh_to_xyxy(src_boxes), + box_ops.box_cxcywh_to_xyxy(target_boxes), + ) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = interpolate( + src_masks[:, None], + size=target_masks.shape[-2:], + mode="bilinear", + align_corners=False, + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat( + [torch.full_like(src, i) for i, (src, _) in enumerate(indices)] + ) # shape [\sum_b num_match_b] + # shape [\sum_b num_match_b] + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat( + [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)] + ) # shape [\sum_b num_match_b] + # shape [\sum_b num_match_b] + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "masks": self.loss_masks, + } + assert loss in loss_map, f"do you really want to compute {loss} loss?" + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def get_foreground_background_loss( + self, loss, outputs, targets, indices, num_boxes, **kwargs + ): + loss_map = { + "labels": self.forground_background_loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + assert loss in loss_map, f"do you really want to compute {loss} loss?" + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def _forward(self, outputs, outputs_without_aux, targets): + + # Retrieve the matching between the outputs of the last layer and the targets + # A list where each item is [row_indices, col_indices] + indices = self.matcher(outputs_without_aux, targets) + + num_boxes = _reduce_num_boxes( + targets, next(iter(outputs.values())).device) + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss( + loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. 
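# [editor note] aux_outputs holds the predictions of each intermediate decoder layer; each layer
# is re-matched against the targets and its losses are stored under keys suffixed with _{i} below.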
+ continue + kwargs = {} + if loss == "labels": + # Logging is enabled only for the last layer + kwargs = {"log": False} + l_dict = self.get_loss( + loss, aux_outputs, targets, indices, num_boxes, **kwargs + ) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + return losses + + def forward(self, outputs, targets): + """This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + # "pred_logits" shape (B, S, NUM_CLASS + 1) + # "pred_boxes" shape (B, S, 4) + outputs_without_aux = {k: v for k, + v in outputs.items() if k != "aux_outputs"} + return self._forward(outputs, outputs_without_aux, targets) + + +class FocalLossSetCriterion(SetCriterion): + """This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + + def __init__(self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25): + """Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + losses: list of all the losses to be applied. See get_loss for list of available losses. + focal_alpha: alpha in Focal Loss + """ + super().__init__(num_classes, matcher, weight_dict, 0, losses) + self.focal_alpha = focal_alpha + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert "pred_logits" in outputs + # shape (batch_size, num_queries, num_classes) + src_logits = outputs["pred_logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat( + [t["labels"][J] for t, (_, J) in zip(targets, indices)] + ) + target_classes = torch.full( + src_logits.shape[:2], + self.num_classes, + dtype=torch.int64, + device=src_logits.device, + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], + dtype=src_logits.dtype, + layout=src_logits.layout, + device=src_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss( + src_logits, + target_classes_onehot, + num_boxes, + alpha=self.focal_alpha, + gamma=2, + ) + * src_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses["class_error"] = 100 - \ + accuracy(src_logits[idx], target_classes_o)[0] + return losses + + def forward(self, outputs, targets): + """This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. 
+ The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = { + k: v + for k, v in outputs.items() + if k != "aux_outputs" and k != "enc_outputs" + } + + losses = self._forward(outputs, outputs_without_aux, targets) + + if "enc_outputs" in outputs: + num_boxes = _reduce_num_boxes( + targets, next(iter(outputs.values())).device) + enc_outputs = outputs["enc_outputs"] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt["labels"] = torch.zeros_like(bt["labels"]) + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + kwargs = {} + if loss == "labels": + # Logging is enabled only for the last layer + kwargs["log"] = False + l_dict = self.get_foreground_background_loss( + loss, enc_outputs, bin_targets, indices, num_boxes, **kwargs + ) + l_dict = {k + "_enc": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses diff --git a/yolov7/modeling/loss/sparseinst_loss.py b/yolov7/modeling/loss/sparseinst_loss.py new file mode 100644 index 0000000..a76ab37 --- /dev/null +++ b/yolov7/modeling/loss/sparseinst_loss.py @@ -0,0 +1,365 @@ +# Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved + +import torch +import torch.nn as nn +import torch.nn.functional as F +from scipy.optimize import linear_sum_assignment +from fvcore.nn import sigmoid_focal_loss_jit + +from detectron2.utils.registry import Registry + +from yolov7.utils.misc import nested_masks_from_list, is_dist_avail_and_initialized, get_world_size + +SPARSE_INST_MATCHER_REGISTRY = Registry("SPARSE_INST_MATCHER") +SPARSE_INST_MATCHER_REGISTRY.__doc__ = "Matcher for SparseInst" +SPARSE_INST_CRITERION_REGISTRY = Registry("SPARSE_INST_CRITERION") +SPARSE_INST_CRITERION_REGISTRY.__doc__ = "Criterion for SparseInst" + + +def compute_mask_iou(inputs, targets): + inputs = inputs.sigmoid() + # thresholding + binarized_inputs = (inputs >= 0.4).float() + targets = (targets > 0.5).float() + intersection = (binarized_inputs * targets).sum(-1) + union = targets.sum(-1) + binarized_inputs.sum(-1) - intersection + score = intersection / (union + 1e-6) + return score + + +def dice_score(inputs, targets): + inputs = inputs.sigmoid() + numerator = 2 * torch.matmul(inputs, targets.t()) + denominator = (inputs * inputs).sum(-1)[:, None] + (targets * targets).sum(-1) + score = numerator / (denominator + 1e-4) + return score + + +def dice_loss(inputs, targets, reduction="sum"): + inputs = inputs.sigmoid() + assert inputs.shape == targets.shape + numerator = 2 * (inputs * targets).sum(1) + denominator = (inputs * inputs).sum(-1) + (targets * targets).sum(-1) + loss = 1 - (numerator) / (denominator + 1e-4) + if reduction == "none": + return loss + return loss.sum() + + +@SPARSE_INST_CRITERION_REGISTRY.register() +class SparseInstCriterion(nn.Module): + # This part is partially derivated from: https://github.com/facebookresearch/detr/blob/main/models/detr.py + + def __init__(self, cfg, matcher): + super().__init__() + self.matcher = matcher + self.losses = cfg.MODEL.SPARSE_INST.LOSS.ITEMS + self.weight_dict = self.get_weight_dict(cfg) + self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES + + def get_weight_dict(self, cfg): + losses = ("loss_ce", "loss_mask", "loss_dice", "loss_objectness") + weight_dict = {} + ce_weight = cfg.MODEL.SPARSE_INST.LOSS.CLASS_WEIGHT + mask_weight = cfg.MODEL.SPARSE_INST.LOSS.MASK_PIXEL_WEIGHT + dice_weight 
= cfg.MODEL.SPARSE_INST.LOSS.MASK_DICE_WEIGHT + objectness_weight = cfg.MODEL.SPARSE_INST.LOSS.OBJECTNESS_WEIGHT + + weight_dict = dict( + zip(losses, (ce_weight, mask_weight, dice_weight, objectness_weight)) + ) + return weight_dict + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat( + [torch.full_like(src, i) for i, (src, _) in enumerate(indices)] + ) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat( + [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)] + ) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def loss_labels(self, outputs, targets, indices, num_instances, input_shape=None): + assert "pred_logits" in outputs + src_logits = outputs["pred_logits"] + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat( + [t["labels"][J] for t, (_, J) in zip(targets, indices)] + ) + target_classes = torch.full( + src_logits.shape[:2], + self.num_classes, + dtype=torch.int64, + device=src_logits.device, + ) + target_classes[idx] = target_classes_o + + src_logits = src_logits.flatten(0, 1) + # prepare one_hot target. + target_classes = target_classes.flatten(0, 1) + pos_inds = torch.nonzero(target_classes != self.num_classes, as_tuple=True)[0] + labels = torch.zeros_like(src_logits) + labels[pos_inds, target_classes[pos_inds]] = 1 + # comp focal loss. + class_loss = ( + sigmoid_focal_loss_jit( + src_logits, + labels, + alpha=0.25, + gamma=2.0, + reduction="sum", + ) + / num_instances + ) + losses = {"loss_ce": class_loss} + return losses + + def loss_masks_with_iou_objectness( + self, outputs, targets, indices, num_instances, input_shape + ): + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + # Bx100xHxW + assert "pred_masks" in outputs + assert "pred_scores" in outputs + src_iou_scores = outputs["pred_scores"] + src_masks = outputs["pred_masks"] + with torch.no_grad(): + target_masks, _ = nested_masks_from_list( + [t["masks"].tensor for t in targets], input_shape + ).decompose() + num_masks = [len(t["masks"]) for t in targets] + target_masks = target_masks.to(src_masks) + if len(target_masks) == 0: + losses = { + "loss_dice": src_masks.sum() * 0.0, + "loss_mask": src_masks.sum() * 0.0, + "loss_objectness": src_iou_scores.sum() * 0.0, + } + return losses + + src_masks = src_masks[src_idx] + target_masks = F.interpolate( + target_masks[:, None], + size=src_masks.shape[-2:], + mode="bilinear", + align_corners=False, + ).squeeze(1) + + src_masks = src_masks.flatten(1) + # FIXME: tgt_idx + mix_tgt_idx = torch.zeros_like(tgt_idx[1]) + cum_sum = 0 + for num_mask in num_masks: + mix_tgt_idx[cum_sum : cum_sum + num_mask] = cum_sum + cum_sum += num_mask + mix_tgt_idx += tgt_idx[1] + + target_masks = target_masks[mix_tgt_idx].flatten(1) + + with torch.no_grad(): + ious = compute_mask_iou(src_masks, target_masks) + + tgt_iou_scores = ious + src_iou_scores = src_iou_scores[src_idx] + tgt_iou_scores = tgt_iou_scores.flatten(0) + src_iou_scores = src_iou_scores.flatten(0) + + losses = { + "loss_objectness": F.binary_cross_entropy_with_logits( + src_iou_scores, tgt_iou_scores, reduction="mean" + ), + "loss_dice": dice_loss(src_masks, target_masks) / num_instances, + "loss_mask": F.binary_cross_entropy_with_logits( + src_masks, target_masks, reduction="mean" + ), + } + return losses + + 
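# [editor note] Illustrative sketch, not part of the patch: compute_mask_iou() above binarizes the
# sigmoid masks (>= 0.4 for predictions, > 0.5 for targets) and the resulting IoU becomes the
# regression target for the objectness branch, while dice_score() is the soft pairwise variant the
# matcher uses. A tiny standalone example, assuming flattened (N, H*W) mask logits and a
# hypothetical helper name toy_mask_iou:
import torch

def toy_mask_iou(pred_logits: torch.Tensor, gt_masks: torch.Tensor) -> torch.Tensor:
    # same thresholds as compute_mask_iou in this file
    pred = (pred_logits.sigmoid() >= 0.4).float()
    gt = (gt_masks > 0.5).float()
    inter = (pred * gt).sum(-1)
    union = pred.sum(-1) + gt.sum(-1) - inter
    return inter / (union + 1e-6)

pred = torch.randn(3, 16)                  # 3 predicted masks, 4x4 images flattened
gt = torch.randint(0, 2, (3, 16)).float()  # matching ground-truth masks
print(toy_mask_iou(pred, gt))              # per-prediction IoU, i.e. the tgt_iou_scores used above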
def get_loss(self, loss, outputs, targets, indices, num_instances, **kwargs): + loss_map = { + "labels": self.loss_labels, + "masks": self.loss_masks_with_iou_objectness, + } + if loss == "loss_objectness": + # NOTE: loss_objectness will be calculated in `loss_masks_with_iou_objectness` + return {} + assert loss in loss_map + return loss_map[loss](outputs, targets, indices, num_instances, **kwargs) + + def forward(self, outputs, targets, input_shape): + + outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets, input_shape) + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_instances = sum(len(t["labels"]) for t in targets) + num_instances = torch.as_tensor( + [num_instances], + dtype=torch.float, + device=next(iter(outputs.values())).device, + ) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_instances) + num_instances = torch.clamp(num_instances / get_world_size(), min=1).item() + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update( + self.get_loss( + loss, + outputs, + targets, + indices, + num_instances, + input_shape=input_shape, + ) + ) + + for k in losses.keys(): + if k in self.weight_dict: + losses[k] *= self.weight_dict[k] + + return losses + + +@SPARSE_INST_MATCHER_REGISTRY.register() +class SparseInstMatcherV1(nn.Module): + def __init__(self, cfg): + super().__init__() + self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA + self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA + self.mask_score = dice_score + + @torch.no_grad() + def forward(self, outputs, targets, input_shape): + B, N, H, W = outputs["pred_masks"].shape + pred_masks = outputs["pred_masks"] + pred_logits = outputs["pred_logits"].sigmoid() + + indices = [] + + for i in range(B): + tgt_ids = targets[i]["labels"] + # no annotations + if tgt_ids.shape[0] == 0: + indices.append((torch.as_tensor([]), torch.as_tensor([]))) + continue + + tgt_masks = targets[i]["masks"].tensor.to(pred_masks) + pred_logit = pred_logits[i] + out_masks = pred_masks[i] + + # upsampling: + # (1) padding/ + # (2) upsampling to 1x input size (input_shape) + # (3) downsampling to 0.25x input size (output mask size) + ori_h, ori_w = tgt_masks.size(1), tgt_masks.size(2) + tgt_masks_ = torch.zeros( + (1, tgt_masks.size(0), input_shape[0], input_shape[1]) + ).to(pred_masks) + tgt_masks_[0, :, :ori_h, :ori_w] = tgt_masks + tgt_masks = F.interpolate( + tgt_masks_, + size=out_masks.shape[-2:], + mode="bilinear", + align_corners=False, + )[0] + + # compute dice score and classification score + tgt_masks = tgt_masks.flatten(1) + out_masks = out_masks.flatten(1) + + mask_score = self.mask_score(out_masks, tgt_masks) + # Nx(Number of gts) + matching_prob = pred_logit[:, tgt_ids] + C = (mask_score**self.alpha) * (matching_prob**self.beta) + # hungarian matching + inds = linear_sum_assignment(C.cpu(), maximize=True) + indices.append(inds) + return [ + ( + torch.as_tensor(i, dtype=torch.int64), + torch.as_tensor(j, dtype=torch.int64), + ) + for i, j in indices + ] + + +@SPARSE_INST_MATCHER_REGISTRY.register() +class SparseInstMatcher(nn.Module): + def __init__(self, cfg): + super().__init__() + self.alpha = cfg.MODEL.SPARSE_INST.MATCHER.ALPHA + self.beta = cfg.MODEL.SPARSE_INST.MATCHER.BETA + self.mask_score = dice_score + + def forward(self, outputs, targets, input_shape): + with torch.no_grad(): + 
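# [editor note] Matching cost for SparseInst: every (prediction, ground-truth) pair is scored with
# C = dice_score(pred_mask, gt_mask) ** alpha * p_class(gt_label) ** beta, computed per image, and
# scipy.optimize.linear_sum_assignment(C, maximize=True) then picks the one-to-one assignment with
# the highest total score.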
B, N, H, W = outputs["pred_masks"].shape + pred_masks = outputs["pred_masks"] + pred_logits = outputs["pred_logits"].sigmoid() + + tgt_ids = torch.cat([v["labels"] for v in targets]) + + if tgt_ids.shape[0] == 0: + return [ + ( + torch.as_tensor([]).to(pred_logits), + torch.as_tensor([]).to(pred_logits), + ) + ] * B + tgt_masks, _ = nested_masks_from_list( + [t["masks"].tensor for t in targets], input_shape + ).decompose() + device = pred_masks.device + tgt_masks = tgt_masks.to(pred_masks) + + tgt_masks = F.interpolate( + tgt_masks[:, None], + size=pred_masks.shape[-2:], + mode="bilinear", + align_corners=False, + ).squeeze(1) + + pred_masks = pred_masks.view(B * N, -1) + tgt_masks = tgt_masks.flatten(1) + + mask_score = self.mask_score(pred_masks, tgt_masks) + # Nx(Number of gts) + matching_prob = pred_logits.view(B * N, -1)[:, tgt_ids] + C = (mask_score**self.alpha) * (matching_prob**self.beta) + C = C.view(B, N, -1).cpu() + # hungarian matching + sizes = [len(v["masks"]) for v in targets] + indices = [ + linear_sum_assignment(c[i], maximize=True) + for i, c in enumerate(C.split(sizes, -1)) + ] + indices = [ + ( + torch.as_tensor(i, dtype=torch.int64), + torch.as_tensor(j, dtype=torch.int64), + ) + for i, j in indices + ] + return indices + + +def build_sparse_inst_matcher(cfg): + name = cfg.MODEL.SPARSE_INST.MATCHER.NAME + return SPARSE_INST_MATCHER_REGISTRY.get(name)(cfg) + + +def build_sparse_inst_criterion(cfg): + matcher = build_sparse_inst_matcher(cfg) + name = cfg.MODEL.SPARSE_INST.LOSS.NAME + return SPARSE_INST_CRITERION_REGISTRY.get(name)(cfg, matcher) diff --git a/yolov7/modeling/meta_arch/__init__.py b/yolov7/modeling/meta_arch/__init__.py old mode 100755 new mode 100644 index dc5ceab..27b3a7b --- a/yolov7/modeling/meta_arch/__init__.py +++ b/yolov7/modeling/meta_arch/__init__.py @@ -6,7 +6,12 @@ from .yolof import YOLOF from .yolox import YOLOX from .yolov5 import YOLOV5 +from .yolov6 import YOLOV6 from .solov2 import SOLOv2 from .detr import Detr +from .anchor_detr import AnchorDetr +from .smca_detr import SMCADetr +from .detr_d2go import DetrD2go +from .sparseinst import SparseInst \ No newline at end of file diff --git a/yolov7/modeling/meta_arch/anchor_detr.py b/yolov7/modeling/meta_arch/anchor_detr.py old mode 100755 new mode 100644 index 4408969..a671ddc --- a/yolov7/modeling/meta_arch/anchor_detr.py +++ b/yolov7/modeling/meta_arch/anchor_detr.py @@ -1,7 +1,7 @@ # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved import logging import math -from typing import List, Dict +from typing import List, Dict, OrderedDict import numpy as np import torch @@ -10,21 +10,24 @@ from scipy.optimize import linear_sum_assignment from torch import nn +from detectron2.utils import comm from detectron2.layers import ShapeSpec from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, detector_postprocess from detectron2.structures import Boxes, ImageList, Instances, BitMasks, PolygonMasks from detectron2.utils.logger import log_first_n from fvcore.nn import giou_loss, smooth_l1_loss -from yolov7.utils.detr_utils import HungarianMatcher -from yolov7.utils.boxes import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, convert_coco_poly_to_mask -from yolov7.utils.misc import NestedTensor, nested_tensor_from_tensor_list +from yolov7.utils.detr_utils import HungarianMatcherAnchorDETR +from yolov7.utils.boxes import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, convert_coco_poly_to_mask, generalized_box_iou +from yolov7.utils.misc import NestedTensor, nested_tensor_from_tensor_list, accuracy from alfred.utils.log import logger -from ..backbone.detr_backbone import Joiner, PositionEmbeddingSine, Transformer -from .detr_seg import DETRsegm, PostProcessPanoptic, PostProcessSegm - +from ..backbone.detr_backbone import Joiner, PositionEmbeddingSine +from ..backbone.anchordetr_backbone import Transformer +from .detr_seg import DETRsegm, PostProcessPanoptic, PostProcessSegm, sigmoid_focal_loss, dice_loss +from alfred.dl.torch.common import device +import pickle __all__ = ["AnchorDetr"] @@ -34,22 +37,23 @@ class AnchorDetr(nn.Module): """ Implement AnchorDetr """ + def __init__(self, cfg): super().__init__() self.device = torch.device(cfg.MODEL.DEVICE) - self.ignore_thresh = cfg.MODEL.YOLO.CONF_THRESHOLD + self.conf_thresh = cfg.MODEL.YOLO.CONF_THRESHOLD + self.ignore_thresh = cfg.MODEL.YOLO.IGNORE_THRESHOLD self.num_classes = cfg.MODEL.DETR.NUM_CLASSES self.mask_on = cfg.MODEL.MASK_ON hidden_dim = cfg.MODEL.DETR.HIDDEN_DIM - num_queries = cfg.MODEL.DETR.NUM_OBJECT_QUERIES # Transformer parameters: nheads = cfg.MODEL.DETR.NHEADS dropout = cfg.MODEL.DETR.DROPOUT dim_feedforward = cfg.MODEL.DETR.DIM_FEEDFORWARD enc_layers = cfg.MODEL.DETR.ENC_LAYERS dec_layers = cfg.MODEL.DETR.DEC_LAYERS - pre_norm = cfg.MODEL.DETR.PRE_NORM + num_feature_levels = cfg.MODEL.DETR.NUM_FEATURE_LEVELS # Loss parameters: giou_weight = cfg.MODEL.DETR.GIOU_WEIGHT @@ -57,31 +61,38 @@ def __init__(self, cfg): deep_supervision = cfg.MODEL.DETR.DEEP_SUPERVISION no_object_weight = cfg.MODEL.DETR.NO_OBJECT_WEIGHT - N_steps = hidden_dim // 2 - d2_backbone = MaskedBackboneTraceFriendly(cfg) - # d2_backbone = MaskedBackbone(cfg) - backbone = Joiner(d2_backbone, PositionEmbeddingSine(N_steps, normalize=True)) - backbone.num_channels = d2_backbone.num_channels + num_query_position = cfg.MODEL.DETR.NUM_QUERY_POSITION + num_query_pattern = cfg.MODEL.DETR.NUM_QUERY_PATTERN + spatial_prior = cfg.MODEL.DETR.SPATIAL_PRIOR + + backbone = MaskedBackboneTraceFriendly(cfg) transformer = Transformer( + num_classes=self.num_classes+1, d_model=hidden_dim, dropout=dropout, nhead=nheads, + num_feature_levels=num_feature_levels, dim_feedforward=dim_feedforward, num_encoder_layers=enc_layers, num_decoder_layers=dec_layers, - normalize_before=pre_norm, - return_intermediate_dec=deep_supervision, + activation="relu", + num_query_position=num_query_position, + num_query_pattern=num_query_pattern, + spatial_prior=spatial_prior, ) - self.detr = DETR( - backbone, transformer, 
num_classes=self.num_classes, num_queries=num_queries, aux_loss=deep_supervision - ) + self.detr = AnchorDETR(backbone, + transformer, + num_feature_levels, + aux_loss=deep_supervision) if self.mask_on: frozen_weights = cfg.MODEL.DETR.FROZEN_WEIGHTS if frozen_weights != '': print("LOAD pre-trained weights") - weight = torch.load(frozen_weights, map_location=lambda storage, loc: storage)['model'] + weight = torch.load( + frozen_weights, + map_location=lambda storage, loc: storage)['model'] new_weight = {} for k, v in weight.items(): if 'detr.' in k: @@ -97,28 +108,38 @@ def __init__(self, cfg): self.detr.to(self.device) # building criterion - matcher = HungarianMatcher(cost_class=1, cost_bbox=l1_weight, cost_giou=giou_weight) - weight_dict = {"loss_ce": 1, "loss_bbox": l1_weight} + matcher = HungarianMatcherAnchorDETR(cost_class=1, + cost_bbox=l1_weight, + cost_giou=giou_weight) + weight_dict = {"loss_ce": 2, "loss_bbox": l1_weight} weight_dict["loss_giou"] = giou_weight if deep_supervision: aux_weight_dict = {} for i in range(dec_layers - 1): - aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + aux_weight_dict.update( + {k + f"_{i}": v + for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) losses = ["labels", "boxes", "cardinality"] if self.mask_on: losses += ["masks"] self.criterion = SetCriterion( - self.num_classes, matcher=matcher, weight_dict=weight_dict, eos_coef=no_object_weight, losses=losses, + self.num_classes+1, + matcher=matcher, + weight_dict=weight_dict, + # eos_coef=no_object_weight, + losses=losses, ) self.criterion.to(self.device) - pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) - pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) + pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view( + 3, 1, 1) + pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view( + 3, 1, 1) self.normalizer = lambda x: (x - pixel_mean) / pixel_std self.to(self.device) self.onnx_export = False - + def preprocess_input(self, x): # x = x.permute(0, 3, 1, 2) # x = F.interpolate(x, size=(640, 640)) @@ -146,14 +167,17 @@ def forward(self, batched_inputs): """ if self.onnx_export: logger.info('[WARN] exporting onnx...') - assert isinstance(batched_inputs, (list, torch.Tensor)) or isinstance( - batched_inputs, list), 'onnx export, batched_inputs only needs image tensor or list of tensors' + assert isinstance( + batched_inputs, (list, torch.Tensor) + ) or isinstance( + batched_inputs, list + ), 'onnx export, batched_inputs only needs image tensor or list of tensors' images = self.preprocess_input(batched_inputs) # batched_inputs = batched_inputs.permute(0, 3, 1, 2) # image_ori_sizes = [batched_inputs.shape[1:3]] else: images = self.preprocess_image(batched_inputs) - + if self.onnx_export: self.detr.onnx_export = self.onnx_export self.detr.backbone.prepare_onnx_export() @@ -161,15 +185,21 @@ def forward(self, batched_inputs): output = self.detr(images) if self.training: - gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + gt_instances = [ + x["instances"].to(self.device) for x in batched_inputs + ] targets = self.prepare_targets(gt_instances) loss_dict = self.criterion(output, targets) weight_dict = self.criterion.weight_dict + valid_loss_dict = {} for k in loss_dict.keys(): if k in weight_dict: - loss_dict[k] *= weight_dict[k] - return loss_dict + valid_loss_dict[k] = loss_dict[k] * weight_dict[k] + # loss_dict[k] *= weight_dict[k] + # print(loss_dict) + # return 
loss_dict + return valid_loss_dict else: if self.onnx_export: box_cls = output[0] @@ -179,15 +209,21 @@ def forward(self, batched_inputs): labels = labels.to(torch.float) print(scores.shape) # print(scores.unsqueeze(0).shape) - a = torch.cat([box_pred, scores.unsqueeze(-1), labels.unsqueeze(-1)], dim=-1) + a = torch.cat( + [box_pred, + scores.unsqueeze(-1), + labels.unsqueeze(-1)], + dim=-1) return a else: box_cls = output["pred_logits"] box_pred = output["pred_boxes"] mask_pred = output["pred_masks"] if self.mask_on else None - results = self.inference(box_cls, box_pred, mask_pred, images.image_sizes) + results = self.inference(box_cls, box_pred, mask_pred, + images.image_sizes) processed_results = [] - for results_per_image, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): + for results_per_image, input_per_image, image_size in zip( + results, batched_inputs, images.image_sizes): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) @@ -198,7 +234,9 @@ def prepare_targets(self, targets): new_targets = [] for targets_per_image in targets: h, w = targets_per_image.image_size - image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float, device=self.device) + image_size_xyxy = torch.as_tensor([w, h, w, h], + dtype=torch.float, + device=self.device) gt_classes = targets_per_image.gt_classes gt_boxes = targets_per_image.gt_boxes.tensor / image_size_xyxy gt_boxes = box_xyxy_to_cxcywh(gt_boxes) @@ -224,33 +262,41 @@ def inference(self, box_cls, box_pred, mask_pred, image_sizes): assert len(box_cls) == len(image_sizes) results = [] - # For each box we assign the best class or the second best if the best on is `no_object`. 
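# prepare_targets above rescales ground-truth boxes from absolute XYXY pixels to the
# normalized CXCYWH form the DETR-style losses expect. A small sketch of that conversion,
# assuming box_xyxy_to_cxcywh follows the usual DETR definition:
import torch


def box_xyxy_to_cxcywh(boxes):
    # boxes: (N, 4) as (x0, y0, x1, y1)
    x0, y0, x1, y1 = boxes.unbind(-1)
    return torch.stack([(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0], dim=-1)


def normalize_gt_boxes(gt_boxes_xyxy, image_h, image_w):
    # Divide by (w, h, w, h) so coordinates fall in [0, 1], then convert to cxcywh.
    scale = torch.as_tensor([image_w, image_h, image_w, image_h], dtype=torch.float)
    return box_xyxy_to_cxcywh(gt_boxes_xyxy / scale)


# e.g. a 100x50 box at the top-left corner of a 640x480 (w x h) image:
print(normalize_gt_boxes(torch.tensor([[0., 0., 100., 50.]]), image_h=480, image_w=640))
# tensor([[0.0781, 0.0521, 0.1562, 0.1042]])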
- scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) + prob = box_cls.sigmoid() + # TODO make top-100 as an option for non-focal-loss as well + scores, topk_indexes = torch.topk( + prob.view(box_cls.shape[0], -1), 100, dim=1 + ) + topk_boxes = topk_indexes // box_cls.shape[2] + labels = topk_indexes % box_cls.shape[2] for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) in enumerate(zip( scores, labels, box_pred, image_sizes )): - indexes = scores_per_image > self.ignore_thresh - scores_per_image = scores_per_image[indexes] - labels_per_image = labels_per_image[indexes] - box_pred_per_image = box_pred_per_image[indexes] - result = Instances(image_size) - result.pred_boxes = Boxes(box_cxcywh_to_xyxy(box_pred_per_image)) + boxes = box_cxcywh_to_xyxy(box_pred_per_image) + boxes = torch.gather( + boxes, 0, topk_boxes[i].unsqueeze(-1).repeat(1, 4)) + result.pred_boxes = Boxes(boxes) - result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0]) + result.pred_boxes.scale( + scale_x=image_size[1], scale_y=image_size[0]) if self.mask_on: - mask = F.interpolate(mask_pred[i].unsqueeze(0), size=image_size, mode='bilinear', align_corners=False) + mask = F.interpolate(mask_pred[i].unsqueeze( + 0), size=image_size, mode='bilinear', align_corners=False) mask = mask[0].sigmoid() > 0.5 B, N, H, W = mask_pred.shape - mask = BitMasks(mask.cpu()).crop_and_resize(result.pred_boxes.tensor.cpu(), 32) - result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device) - + # print('mask_pred shape: ', mask.shape) + # mask = BitMasks(mask.cpu()).crop_and_resize(result.pred_boxes.tensor.cpu(), 32) + mask = BitMasks(mask.cpu()) + # result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device) + result.pred_bit_masks = mask.to(mask_pred[i].device) + # print('box_pred_per_image: ', box_pred_per_image.shape) result.scores = scores_per_image result.pred_classes = labels_per_image results.append(result) return results - + def inference_onnx(self, box_cls, box_pred, mask_pred, image_sizes): """ appending indices as one of output for convinient select ?? @@ -261,7 +307,9 @@ def preprocess_image(self, batched_inputs): """ Normalize, pad and batch the input images. 
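# The new sigmoid-based inference above flattens the (queries x classes) score matrix,
# keeps the global top-k entries, and recovers the source query / class of each entry by
# integer division and modulo. A self-contained sketch of that decode step (shapes and
# k=100 are illustrative, mirroring the hard-coded top-100 noted in the TODO):
import torch


def topk_decode(box_cls, box_pred, k=100):
    # box_cls: (B, N, C) raw logits; box_pred: (B, N, 4) cxcywh boxes.
    B, N, C = box_cls.shape
    prob = box_cls.sigmoid()
    scores, topk_indexes = torch.topk(prob.view(B, -1), k, dim=1)  # over all N*C entries
    topk_boxes = topk_indexes // C  # which query each hit came from
    labels = topk_indexes % C       # which class it scored
    boxes = torch.gather(box_pred, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
    return scores, labels, boxes    # (B, k), (B, k), (B, k, 4)


scores, labels, boxes = topk_decode(torch.randn(2, 300, 80), torch.rand(2, 300, 4))
print(scores.shape, labels.shape, boxes.shape)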
""" - images = [self.normalizer(x["image"].to(self.device)) for x in batched_inputs] + images = [ + self.normalizer(x["image"].to(self.device)) for x in batched_inputs + ] images = ImageList.from_tensors(images) return images @@ -273,7 +321,8 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): @@ -281,49 +330,6 @@ def forward(self, x): return x -class MaskedBackbone(nn.Module): - """ This is a thin wrapper around D2's backbone to provide padding masking""" - - def __init__(self, cfg): - super().__init__() - self.backbone = build_backbone(cfg) - backbone_shape = self.backbone.output_shape() - self.feature_strides = [backbone_shape[f].stride for f in backbone_shape.keys()] - self.num_channels = backbone_shape[list(backbone_shape.keys())[-1]].channels - - def forward(self, images): - if isinstance(images, ImageList): - features = self.backbone(images.tensor) - device = images.tensor.device - else: - features = self.backbone(images.tensors) - device = images.tensors.device - masks = self.mask_out_padding( - [features_per_level.shape for features_per_level in features.values()], - images.image_sizes, - device, - ) - assert len(features) == len(masks) - for i, k in enumerate(features.keys()): - features[k] = NestedTensor(features[k], masks[i]) - return features - - def mask_out_padding(self, feature_shapes, image_sizes, device): - masks = [] - assert len(feature_shapes) == len(self.feature_strides) - for idx, shape in enumerate(feature_shapes): - N, _, H, W = shape - masks_per_feature_level = torch.ones((N, H, W), dtype=torch.bool, device=device) - for img_idx, (h, w) in enumerate(image_sizes): - masks_per_feature_level[ - img_idx, - : int(np.ceil(float(h) / self.feature_strides[idx])), - : int(np.ceil(float(w) / self.feature_strides[idx])), - ] = 0 - masks.append(masks_per_feature_level) - return masks - - class MaskedBackboneTraceFriendly(nn.Module): """ This is a thin wrapper around D2's backbone to provide padding masking. @@ -334,8 +340,44 @@ def __init__(self, cfg): super().__init__() self.backbone = build_backbone(cfg) backbone_shape = self.backbone.output_shape() - self.feature_strides = [backbone_shape[f].stride for f in backbone_shape.keys()] - self.num_channels = backbone_shape[list(backbone_shape.keys())[-1]].channels + self.num_feature_levels = cfg.MODEL.DETR.NUM_FEATURE_LEVELS + + # if comm.is_main_process(): + # a = torch.randn([1, 3, 256, 256]) + # b = self.backbone(a) + # print('B: ', b) + # self.backbone = torchvision.models.resnet50(pretrained=True) + backbone_shape = self.backbone.output_shape() + + # pretrained_weights = cfg.MODEL.WEIGHTS + # if pretrained_weights: + # logger.info(f'Loading pretrained weights from: {pretrained_weights}') + # with open(pretrained_weights, 'rb') as f: + # wgts = pickle.load(f, encoding='latin1')['model'] + # # wgts = torch.load(pretrained_weights, map_location=lambda storage, loc: storage) + # new_weight = {} + # for k, v in wgts.items(): + # v = torch.from_numpy(v) + # # new_weight['detr.' 
+ k] = v + # new_weight[k] = v + # del wgts + # self.backbone.load_state_dict(new_weight, strict=False) + # del new_weight + + # if comm.is_main_process(): + # c = self.backbone(a) + # print('C: ', c) + + if self.num_feature_levels > 1: + self.num_channels = [512, 1024, 2048] + self.return_interm_layers = ['res3', 'res4', 'res5'] + self.feature_strides = [8, 16, 32] + else: + self.num_channels = [2048] + self.return_interm_layers = ['res5'] + self.feature_strides = [32] + + print(self.num_channels) self.onnx_export = False def forward(self, images): @@ -363,57 +405,90 @@ def forward(self, images): out[name] = NestedTensor(x, mask) return out else: + # features: res2, res3, res4, res5 + features_returned = OrderedDict() + for l in self.return_interm_layers: + features_returned[l] = features[l] + masks = self.mask_out_padding( - [features_per_level.shape for features_per_level in features.values()], + [ + features_per_level.shape + for features_per_level in features_returned.values() + ], images.image_sizes, device, ) - assert len(features) == len(masks) - for i, k in enumerate(features.keys()): - features[k] = NestedTensor(features[k], masks[i]) - return features + assert len(features_returned) == len(masks) + out_nested_features = [] + + for i, k in enumerate(self.return_interm_layers): + out_nested_features.append(NestedTensor(features_returned[k], masks[i])) + return out_nested_features def mask_out_padding(self, feature_shapes, image_sizes, device): masks = [] assert len(feature_shapes) == len(self.feature_strides) for idx, shape in enumerate(feature_shapes): N, _, H, W = shape - masks_per_feature_level = torch.ones((N, H, W), dtype=torch.bool, device=device) + masks_per_feature_level = torch.ones((N, H, W), + dtype=torch.bool, + device=device) for img_idx, (h, w) in enumerate(image_sizes): # print('H', H, 'W', W, 'ceil: ', int(np.ceil(float(h) / self.feature_strides[idx])),) - masks_per_feature_level[ - img_idx, - : int(np.ceil(float(h) / self.feature_strides[idx])), - : int(np.ceil(float(w) / self.feature_strides[idx])), - ] = 0 + masks_per_feature_level[img_idx, :int( + np.ceil(float(h) / self.feature_strides[idx]) + ), :int(np.ceil(float(w) / self.feature_strides[idx])), ] = 0 masks.append(masks_per_feature_level) return masks -class DETR(nn.Module): - """ This is the DETR module that performs object detection """ - def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False): +class AnchorDETR(nn.Module): + """ This is the AnchorDETR module that performs object detection """ + + def __init__(self, backbone, transformer, num_feature_levels, aux_loss=True): """ Initializes the model. Parameters: backbone: torch module of the backbone to be used. See backbone.py transformer: torch module of the transformer architecture. See transformer.py num_classes: number of object classes - num_queries: number of object queries, ie detection slot. This is the maximal number of objects - DETR can detect in a single image. For COCO, we recommend 100 queries. aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
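# With NUM_FEATURE_LEVELS > 1 the wrapper above hands the transformer one NestedTensor per
# selected stage (res3/res4/res5 at strides 8/16/32). A minimal sketch of how one such
# (feature, padding-mask) pair is assembled; NestedTensor here is a reduced stand-in for
# the helper imported from yolov7.utils.misc.
import numpy as np
import torch


class NestedTensor:
    def __init__(self, tensors, mask):
        self.tensors, self.mask = tensors, mask

    def decompose(self):
        return self.tensors, self.mask


def wrap_level(feature, image_sizes, stride):
    # feature: (N, C, H, W) padded batch; image_sizes: true (h, w) per image before padding.
    # The mask is True on padded pixels and False on the ceil(h/stride) x ceil(w/stride)
    # region corresponding to real image content.
    N, _, H, W = feature.shape
    mask = torch.ones((N, H, W), dtype=torch.bool, device=feature.device)
    for i, (h, w) in enumerate(image_sizes):
        mask[i, : int(np.ceil(h / stride)), : int(np.ceil(w / stride))] = False
    return NestedTensor(feature, mask)


level = wrap_level(torch.randn(2, 512, 32, 40), [(256, 320), (200, 160)], stride=8)
feat, mask = level.decompose()
print(feat.shape, mask.shape, mask[1].sum().item())  # 780 padded positions for image 2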
""" super().__init__() - self.num_queries = num_queries self.transformer = transformer hidden_dim = transformer.d_model - self.class_embed = nn.Linear(hidden_dim, num_classes + 1) - self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) - self.query_embed = nn.Embedding(num_queries, hidden_dim) - self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) + + self.num_feature_levels = num_feature_levels + logger.info(f'{backbone.num_channels}') + if num_feature_levels > 1: + num_backbone_outs = len(backbone.strides) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.num_channels[_] + if _ == 0: + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + )) + else: + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + )) + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(backbone.num_channels[0], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + )]) self.backbone = backbone self.aux_loss = aux_loss self.onnx_export = False + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: - samples.tensor: batched images, of shape [batch_size x 3 x H x W] @@ -429,26 +504,33 @@ def forward(self, samples: NestedTensor): - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of dictionnaries containing the two above keys for each decoder layer. """ - if isinstance(samples, (list, torch.Tensor)): + if not isinstance(samples, NestedTensor): samples = nested_tensor_from_tensor_list(samples) - # print('samples: ', samples.shape) - # print(samples, type(samples)) - features, pos = self.backbone(samples) - # print(features, 'features') - - src, mask = features[-1].decompose() - assert mask is not None - hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] - - outputs_class = self.class_embed(hs) - outputs_coord = self.bbox_embed(hs) - outputs_coord = torch.sigmoid(outputs_coord) - if self.onnx_export: + features = self.backbone(samples) + # print(features) + + srcs = [] + masks = [] + for l, feat in enumerate(features): + src, mask = feat.decompose() + srcs.append(self.input_proj[l](src).unsqueeze(1)) + masks.append(mask) + assert mask is not None + + srcs = torch.cat(srcs, dim=1) + + outputs_class, outputs_coord = self.transformer(srcs, masks) + + if self.onnx_export: return outputs_class[-1], outputs_coord[-1] else: - out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} + out = { + 'pred_logits': outputs_class[-1], + 'pred_boxes': outputs_coord[-1] + } if self.aux_loss: - out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) + out['aux_outputs'] = self._set_aux_loss( + outputs_class, outputs_coord) return out @torch.jit.unused @@ -456,8 +538,10 @@ def _set_aux_loss(self, outputs_class, outputs_coord): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. 
- return [{'pred_logits': a, 'pred_boxes': b} - for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + return [{ + 'pred_logits': a, + 'pred_boxes': b + } for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] class SetCriterion(nn.Module): @@ -466,24 +550,22 @@ class SetCriterion(nn.Module): 1) we compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) """ - def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + + def __init__(self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25): """ Create the criterion. Parameters: num_classes: number of object categories, omitting the special no-object category matcher: module able to compute a matching between targets and proposals weight_dict: dict containing as key the names of the losses and as values their relative weight. - eos_coef: relative classification weight applied to the no-object category losses: list of all the losses to be applied. See get_loss for list of available losses. + focal_alpha: alpha in Focal Loss """ super().__init__() self.num_classes = num_classes self.matcher = matcher self.weight_dict = weight_dict - self.eos_coef = eos_coef self.losses = losses - empty_weight = torch.ones(self.num_classes + 1) - empty_weight[-1] = self.eos_coef - self.register_buffer('empty_weight', empty_weight) + self.focal_alpha = focal_alpha def loss_labels(self, outputs, targets, indices, num_boxes, log=True): """Classification loss (NLL) @@ -493,12 +575,19 @@ def loss_labels(self, outputs, targets, indices, num_boxes, log=True): src_logits = outputs['pred_logits'] idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device) target_classes[idx] = target_classes_o - loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], + dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) * \ + src_logits.shape[1] losses = {'loss_ce': loss_ce} if log: @@ -523,7 +612,7 @@ def loss_cardinality(self, outputs, targets, indices, num_boxes): def loss_boxes(self, outputs, targets, indices, num_boxes): """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] - The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size. 
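# loss_labels above swaps plain DETR's softmax cross-entropy for a sigmoid focal loss over
# one-hot targets: scatter into num_classes + 1 columns, drop the trailing no-object column,
# then apply the focal loss and scale by the number of queries. A hedged sketch; the real
# sigmoid_focal_loss is imported from detr_seg, this is only the usual DETR-style form.
import torch
import torch.nn.functional as F


def sigmoid_focal_loss(inputs, targets, num_boxes, alpha=0.25, gamma=2):
    prob = inputs.sigmoid()
    ce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
    p_t = prob * targets + (1 - prob) * (1 - targets)
    loss = ce * ((1 - p_t) ** gamma)
    if alpha >= 0:
        loss = (alpha * targets + (1 - alpha) * (1 - targets)) * loss
    return loss.mean(1).sum() / num_boxes


src_logits = torch.randn(2, 300, 80)                          # (batch, queries, classes)
target_classes = torch.full((2, 300), 80, dtype=torch.int64)  # 80 == "no object"
target_classes[0, :3] = torch.tensor([1, 7, 42])              # three matched queries
onehot = torch.zeros(2, 300, 81)
onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
onehot = onehot[:, :, :-1]                                    # drop the no-object column
print(sigmoid_focal_loss(src_logits, onehot, num_boxes=3) * src_logits.shape[1])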
""" assert 'pred_boxes' in outputs idx = self._get_src_permutation_idx(indices) @@ -535,9 +624,9 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): losses = {} losses['loss_bbox'] = loss_bbox.sum() / num_boxes - loss_giou = 1 - torch.diag(box_ops.generalized_box_iou( - box_ops.box_cxcywh_to_xyxy(src_boxes), - box_ops.box_cxcywh_to_xyxy(target_boxes))) + loss_giou = 1 - torch.diag(generalized_box_iou( + box_cxcywh_to_xyxy(src_boxes), + box_cxcywh_to_xyxy(target_boxes))) losses['loss_giou'] = loss_giou.sum() / num_boxes return losses @@ -549,21 +638,21 @@ def loss_masks(self, outputs, targets, indices, num_boxes): src_idx = self._get_src_permutation_idx(indices) tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] - src_masks = src_masks[src_idx] - masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss - target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets]).decompose() target_masks = target_masks.to(src_masks) - target_masks = target_masks[tgt_idx] + src_masks = src_masks[src_idx] # upsample predictions to the target size src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False) src_masks = src_masks[:, 0].flatten(1) - target_masks = target_masks.flatten(1) - target_masks = target_masks.view(src_masks.shape) + target_masks = target_masks[tgt_idx].flatten(1) + losses = { "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), "loss_dice": dice_loss(src_masks, target_masks, num_boxes), @@ -599,7 +688,7 @@ def forward(self, outputs, targets): targets: list of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the losses applied, see each loss' doc """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} + outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs' and k != 'enc_outputs'} # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) @@ -607,15 +696,16 @@ def forward(self, outputs, targets): # Compute the average number of target boxes accross all nodes, for normalization purposes num_boxes = sum(len(t["labels"]) for t in targets) num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) - if is_dist_avail_and_initialized(): + if comm.get_world_size() > 1: torch.distributed.all_reduce(num_boxes) - num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + num_boxes = torch.clamp(num_boxes / comm.get_world_size(), min=1).item() # Compute all the requested losses losses = {} for loss in self.losses: - losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) - + kwargs = {} + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes, **kwargs)) + # print(losses) # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
if 'aux_outputs' in outputs: for i, aux_outputs in enumerate(outputs['aux_outputs']): @@ -627,11 +717,28 @@ def forward(self, outputs, targets): kwargs = {} if loss == 'labels': # Logging is enabled only for the last layer - kwargs = {'log': False} + kwargs['log'] = False l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) l_dict = {k + f'_{i}': v for k, v in l_dict.items()} losses.update(l_dict) + if 'enc_outputs' in outputs: + enc_outputs = outputs['enc_outputs'] + bin_targets = copy.deepcopy(targets) + for bt in bin_targets: + bt['labels'] = torch.zeros_like(bt['labels']) + indices = self.matcher(enc_outputs, bin_targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. + continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs['log'] = False + l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes, **kwargs) + l_dict = {k + f'_enc': v for k, v in l_dict.items()} + losses.update(l_dict) return losses @@ -655,16 +762,16 @@ def forward(self, outputs, target_sizes): scores, labels = prob[..., :-1].max(-1) # convert to [x0, y0, x1, y1] format - boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) + boxes = box_cxcywh_to_xyxy(out_bbox) # and from relative [0, 1] to absolute [0, height] coordinates img_h, img_w = target_sizes.unbind(1) scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) boxes = boxes * scale_fct[:, None, :] - results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] + results = [{ + 'scores': s, + 'labels': l, + 'boxes': b + } for s, l, b in zip(scores, labels, boxes)] return results - - - - diff --git a/yolov7/modeling/meta_arch/dab_detr.py b/yolov7/modeling/meta_arch/dab_detr.py new file mode 100644 index 0000000..8b8c64e --- /dev/null +++ b/yolov7/modeling/meta_arch/dab_detr.py @@ -0,0 +1,742 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +import math +from typing import List, Dict + +import numpy as np +import torch +from detectron2.utils import comm +import torch.nn.functional as F +from scipy.optimize import linear_sum_assignment +from torch import nn +import torch.distributed as dist + +from detectron2.layers import ShapeSpec +from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, detector_postprocess +from detectron2.structures import Boxes, ImageList, Instances, BitMasks, PolygonMasks +from detectron2.utils.logger import log_first_n +from fvcore.nn import giou_loss, smooth_l1_loss + +from yolov7.utils.detr_utils import HungarianMatcher +from yolov7.utils.boxes import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, convert_coco_poly_to_mask, generalized_box_iou +from yolov7.utils.misc import NestedTensor, nested_tensor_from_tensor_list, is_dist_avail_and_initialized, accuracy +from yolov7.utils.misc import inverse_sigmoid +from alfred.utils.log import logger + +from ..backbone.detr_backbone import Joiner, PositionEmbeddingSine, Transformer +from .detr_seg import DETRsegm, PostProcessPanoptic, PostProcessSegm, sigmoid_focal_loss, dice_loss + + +__all__ = ["DABDetr"] + + +@META_ARCH_REGISTRY.register() +class DABDetr(nn.Module): + """ + Implement DABDetr + """ + def __init__(self, cfg): + super().__init__() + + self.device = torch.device(cfg.MODEL.DEVICE) + self.ignore_thresh = cfg.MODEL.YOLO.CONF_THRESHOLD + self.num_classes = cfg.MODEL.DETR.NUM_CLASSES + self.mask_on = cfg.MODEL.MASK_ON + hidden_dim = cfg.MODEL.DETR.HIDDEN_DIM + num_queries = cfg.MODEL.DETR.NUM_OBJECT_QUERIES + # Transformer parameters: + nheads = cfg.MODEL.DETR.NHEADS + dropout = cfg.MODEL.DETR.DROPOUT + dim_feedforward = cfg.MODEL.DETR.DIM_FEEDFORWARD + enc_layers = cfg.MODEL.DETR.ENC_LAYERS + dec_layers = cfg.MODEL.DETR.DEC_LAYERS + pre_norm = cfg.MODEL.DETR.PRE_NORM + + # Loss parameters: + giou_weight = cfg.MODEL.DETR.GIOU_WEIGHT + l1_weight = cfg.MODEL.DETR.L1_WEIGHT + deep_supervision = cfg.MODEL.DETR.DEEP_SUPERVISION + no_object_weight = cfg.MODEL.DETR.NO_OBJECT_WEIGHT + + N_steps = hidden_dim // 2 + d2_backbone = MaskedBackboneTraceFriendly(cfg) + # d2_backbone = MaskedBackbone(cfg) + backbone = Joiner(d2_backbone, PositionEmbeddingSine(N_steps, normalize=True)) + backbone.num_channels = d2_backbone.num_channels + + transformer = Transformer( + d_model=hidden_dim, + dropout=dropout, + nhead=nheads, + dim_feedforward=dim_feedforward, + num_encoder_layers=enc_layers, + num_decoder_layers=dec_layers, + normalize_before=pre_norm, + return_intermediate_dec=deep_supervision, + ) + + self.detr = DABDETR( + backbone, transformer, num_classes=self.num_classes, num_queries=num_queries, aux_loss=deep_supervision + ) + if self.mask_on: + frozen_weights = cfg.MODEL.DETR.FROZEN_WEIGHTS + if frozen_weights != '': + print("LOAD pre-trained weights") + weight = torch.load(frozen_weights, map_location=lambda storage, loc: storage)['model'] + new_weight = {} + for k, v in weight.items(): + if 'detr.' 
in k: + new_weight[k.replace('detr.', '')] = v + else: + print(f"Skipping loading weight {k} from frozen model") + del weight + self.detr.load_state_dict(new_weight) + del new_weight + self.detr = DETRsegm(self.detr, freeze_detr=(frozen_weights != '')) + self.seg_postprocess = PostProcessSegm + + self.detr.to(self.device) + + # building criterion + matcher = HungarianMatcher(cost_class=1, cost_bbox=l1_weight, cost_giou=giou_weight) + weight_dict = {"loss_ce": 1, "loss_bbox": l1_weight} + weight_dict["loss_giou"] = giou_weight + if deep_supervision: + aux_weight_dict = {} + for i in range(dec_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + losses = ["labels", "boxes", "cardinality"] + if self.mask_on: + losses += ["masks"] + self.criterion = SetCriterion( + self.num_classes, matcher=matcher, weight_dict=weight_dict, eos_coef=no_object_weight, losses=losses, + ) + self.criterion.to(self.device) + + pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) + pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) + self.normalizer = lambda x: (x - pixel_mean) / pixel_std + self.to(self.device) + self.onnx_export = False + + def update_iter(self, i): + self.iter = i + + def preprocess_input(self, x): + # x = x.permute(0, 3, 1, 2) + # x = F.interpolate(x, size=(640, 640)) + # x = F.interpolate(x, size=(512, 960)) + """ + x is N, CHW aleady permuted + """ + x = [self.normalizer(i) for i in x] + return x + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper` . + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + * image: Tensor, image in (C, H, W) format. + * instances: Instances + Other information that's included in the original dicts, such as: + * "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + Returns: + dict[str: Tensor]: + mapping from a named loss to a tensor storing the loss. Used during training only. 
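# With deep supervision every intermediate decoder layer gets its own copy of the loss
# weights, suffixed _0 ... _{dec_layers-2}, which is what the aux_weight_dict expansion
# above produces. A tiny sketch of the resulting dictionary:
def expand_aux_weights(weight_dict, dec_layers):
    out = dict(weight_dict)
    for i in range(dec_layers - 1):
        out.update({f"{k}_{i}": v for k, v in weight_dict.items()})
    return out


print(expand_aux_weights({"loss_ce": 1, "loss_bbox": 5, "loss_giou": 2}, dec_layers=3))
# {'loss_ce': 1, 'loss_bbox': 5, 'loss_giou': 2,
#  'loss_ce_0': 1, 'loss_bbox_0': 5, 'loss_giou_0': 2,
#  'loss_ce_1': 1, 'loss_bbox_1': 5, 'loss_giou_1': 2}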
+ """ + if self.onnx_export: + logger.info('[WARN] exporting onnx...') + assert isinstance(batched_inputs, (list, torch.Tensor)) or isinstance( + batched_inputs, list), 'onnx export, batched_inputs only needs image tensor or list of tensors' + images = self.preprocess_input(batched_inputs) + # batched_inputs = batched_inputs.permute(0, 3, 1, 2) + # image_ori_sizes = [batched_inputs.shape[1:3]] + else: + images = self.preprocess_image(batched_inputs) + + if self.onnx_export: + self.detr.onnx_export = self.onnx_export + self.detr.backbone.prepare_onnx_export() + + output = self.detr(images) + + if self.training: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + + targets = self.prepare_targets(gt_instances) + loss_dict = self.criterion(output, targets) + weight_dict = self.criterion.weight_dict + for k in loss_dict.keys(): + if k in weight_dict: + loss_dict[k] *= weight_dict[k] + print(loss_dict) + return loss_dict + else: + if self.onnx_export: + box_cls = output[0] + box_pred = output[1] + scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) + box_pred = box_cxcywh_to_xyxy(box_pred) + labels = labels.to(torch.float) + # print(scores.shape) + # print(scores.unsqueeze(0).shape) + a = torch.cat([box_pred, scores.unsqueeze(-1), labels.unsqueeze(-1)], dim=-1) + return a + else: + box_cls = output["pred_logits"] + box_pred = output["pred_boxes"] + mask_pred = output["pred_masks"] if self.mask_on else None + + # print(mask_pred.shape) + results = self.inference(box_cls, box_pred, mask_pred, images.image_sizes) + processed_results = [] + for results_per_image, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + return processed_results + + def prepare_targets(self, targets): + new_targets = [] + for targets_per_image in targets: + h, w = targets_per_image.image_size + image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float, device=self.device) + gt_classes = targets_per_image.gt_classes + gt_boxes = targets_per_image.gt_boxes.tensor / image_size_xyxy + gt_boxes = box_xyxy_to_cxcywh(gt_boxes) + new_targets.append({"labels": gt_classes, "boxes": gt_boxes}) + if self.mask_on and hasattr(targets_per_image, 'gt_masks'): + gt_masks = targets_per_image.gt_masks + gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) + new_targets[-1].update({'masks': gt_masks}) + return new_targets + + def inference(self, box_cls, box_pred, mask_pred, image_sizes): + """ + Arguments: + box_cls (Tensor): tensor of shape (batch_size, num_queries, K). + The tensor predicts the classification probability for each query. + box_pred (Tensor): tensors of shape (batch_size, num_queries, 4). + The tensor predicts 4-vector (x,y,w,h) box + regression values for every queryx + image_sizes (List[torch.Size]): the input image sizes + Returns: + results (List[Instances]): a list of #images elements. + """ + assert len(box_cls) == len(image_sizes) + results = [] + + # For each box we assign the best class or the second best if the best on is `no_object`. 
+ scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) + + for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) in enumerate(zip( + scores, labels, box_pred, image_sizes + )): + indexes = scores_per_image > self.ignore_thresh + scores_per_image = scores_per_image[indexes] + labels_per_image = labels_per_image[indexes] + box_pred_per_image = box_pred_per_image[indexes] + + result = Instances(image_size) + result.pred_boxes = Boxes(box_cxcywh_to_xyxy(box_pred_per_image)) + + result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0]) + if self.mask_on: + mask_pred_per_image = mask_pred[i] + mask_pred_per_image = mask_pred_per_image[indexes] + + mask = F.interpolate(mask_pred_per_image.unsqueeze(0), size=image_size, mode='bilinear', align_corners=False) + mask = mask[0].sigmoid() > 0.5 + B, N, H, W = mask_pred.shape + # print('mask_pred shape: ', mask.shape) + # mask = BitMasks(mask.cpu()).crop_and_resize(result.pred_boxes.tensor.cpu(), 32) + mask = BitMasks(mask.cpu()) + # result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device) + result.pred_bit_masks = mask.to(mask_pred_per_image.device) + # print('box_pred_per_image: ', box_pred_per_image.shape) + result.scores = scores_per_image + result.pred_classes = labels_per_image + results.append(result) + return results + + def inference_onnx(self, box_cls, box_pred, mask_pred, image_sizes): + """ + appending indices as one of output for convinient select ?? + """ + pass + + def preprocess_image(self, batched_inputs): + """ + Normalize, pad and batch the input images. + """ + images = [self.normalizer(x["image"].to(self.device)) for x in batched_inputs] + images = ImageList.from_tensors(images) + return images + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +class MaskedBackbone(nn.Module): + """ This is a thin wrapper around D2's backbone to provide padding masking""" + + def __init__(self, cfg): + super().__init__() + self.backbone = build_backbone(cfg) + backbone_shape = self.backbone.output_shape() + self.feature_strides = [backbone_shape[f].stride for f in backbone_shape.keys()] + self.num_channels = backbone_shape[list(backbone_shape.keys())[-1]].channels + + def forward(self, images): + if isinstance(images, ImageList): + features = self.backbone(images.tensor) + device = images.tensor.device + else: + features = self.backbone(images.tensors) + device = images.tensors.device + masks = self.mask_out_padding( + [features_per_level.shape for features_per_level in features.values()], + images.image_sizes, + device, + ) + assert len(features) == len(masks) + for i, k in enumerate(features.keys()): + features[k] = NestedTensor(features[k], masks[i]) + return features + + def mask_out_padding(self, feature_shapes, image_sizes, device): + masks = [] + assert len(feature_shapes) == len(self.feature_strides) + for idx, shape in enumerate(feature_shapes): + N, _, H, W = shape + masks_per_feature_level = torch.ones((N, H, W), dtype=torch.bool, device=device) + for img_idx, (h, w) in enumerate(image_sizes): + masks_per_feature_level[ + img_idx, + : 
int(np.ceil(float(h) / self.feature_strides[idx])), + : int(np.ceil(float(w) / self.feature_strides[idx])), + ] = 0 + masks.append(masks_per_feature_level) + return masks + + +class MaskedBackboneTraceFriendly(nn.Module): + """ + This is a thin wrapper around D2's backbone to provide padding masking. + I change it into tracing friendly with this mask operation. + """ + + def __init__(self, cfg): + super().__init__() + self.backbone = build_backbone(cfg) + backbone_shape = self.backbone.output_shape() + self.feature_strides = [backbone_shape[f].stride for f in backbone_shape.keys()] + self.num_channels = backbone_shape[list(backbone_shape.keys())[-1]].channels + self.onnx_export = False + + def forward(self, images): + if isinstance(images, ImageList): + features = self.backbone(images.tensor) + device = images.tensor.device + else: + features = self.backbone(images.tensors) + device = images.tensors.device + + if self.onnx_export: + logger.info('[onnx export] in MaskedBackbone...') + out: Dict[str, NestedTensor] = {} + for name, x in features.items(): + m = images.mask + print('m: ', m) + print('m: ', m.shape) + assert m is not None + sp = x.shape[-2:] + # mask = F.interpolate(m.to(torch.float), size=sp).to(torch.bool)[0] + # mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] + m = m.unsqueeze(0).float() + mask = F.interpolate(m, size=x.shape[-2:]).to(torch.bool)[0] + print(mask.shape) + out[name] = NestedTensor(x, mask) + return out + else: + masks = self.mask_out_padding( + [features_per_level.shape for features_per_level in features.values()], + images.image_sizes, + device, + ) + assert len(features) == len(masks) + for i, k in enumerate(features.keys()): + features[k] = NestedTensor(features[k], masks[i]) + return features + + def mask_out_padding(self, feature_shapes, image_sizes, device): + masks = [] + assert len(feature_shapes) == len(self.feature_strides) + for idx, shape in enumerate(feature_shapes): + N, _, H, W = shape + masks_per_feature_level = torch.ones((N, H, W), dtype=torch.bool, device=device) + for img_idx, (h, w) in enumerate(image_sizes): + # print('H', H, 'W', W, 'ceil: ', int(np.ceil(float(h) / self.feature_strides[idx])),) + masks_per_feature_level[ + img_idx, + : int(np.ceil(float(h) / self.feature_strides[idx])), + : int(np.ceil(float(w) / self.feature_strides[idx])), + ] = 0 + masks.append(masks_per_feature_level) + return masks + + +class DABDETR(nn.Module): + """ This is the DAB-DETR module that performs object detection """ + def __init__(self, backbone, transformer, num_classes, num_queries, + aux_loss=False, + iter_update=True, + query_dim=4, + bbox_embed_diff_each_layer=False, + random_refpoints_xy=False, + ): + """ Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + Conditional DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + iter_update: iterative update of boxes + query_dim: query dimension. 2 for point and 4 for box. + bbox_embed_diff_each_layer: dont share weights of prediction heads. Default for True.(shared weights.) + random_refpoints_xy: random init the x,y of anchor boxes and freeze them. 
(It sometimes helps to improve the performance) + + + """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + hidden_dim = transformer.d_model + self.class_embed = nn.Linear(hidden_dim, num_classes) + self.bbox_embed_diff_each_layer = bbox_embed_diff_each_layer + if bbox_embed_diff_each_layer: + self.bbox_embed = nn.ModuleList([MLP(hidden_dim, hidden_dim, 4, 3) for i in range(6)]) + else: + self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + + + # setting query dim + self.query_dim = query_dim + assert query_dim in [2, 4] + + self.refpoint_embed = nn.Embedding(num_queries, query_dim) + self.random_refpoints_xy = random_refpoints_xy + if random_refpoints_xy: + # import ipdb; ipdb.set_trace() + self.refpoint_embed.weight.data[:, :2].uniform_(0,1) + self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2]) + self.refpoint_embed.weight.data[:, :2].requires_grad = False + + self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) + self.backbone = backbone + self.aux_loss = aux_loss + self.iter_update = iter_update + + if self.iter_update: + self.transformer.decoder.bbox_embed = self.bbox_embed + + + # init prior_prob setting for focal loss + prior_prob = 0.01 + bias_value = -math.log((1 - prior_prob) / prior_prob) + self.class_embed.bias.data = torch.ones(num_classes) * bias_value + + # import ipdb; ipdb.set_trace() + # init bbox_embed + if bbox_embed_diff_each_layer: + for bbox_embed in self.bbox_embed: + nn.init.constant_(bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(bbox_embed.layers[-1].bias.data, 0) + else: + nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) + + + + def forward(self, samples: NestedTensor): + """ The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x num_classes] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, width, height). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. 
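# Two small numerical tricks above deserve spelling out: frozen reference points are stored
# in logit space via inverse_sigmoid, and the classification bias is chosen so that
# sigmoid(bias) equals the 0.01 focal-loss prior. A sketch, assuming inverse_sigmoid matches
# the usual clamped DETR-family helper rather than this repo's exact implementation:
import math
import torch


def inverse_sigmoid(x, eps=1e-5):
    x = x.clamp(min=0, max=1)
    return torch.log(x.clamp(min=eps) / (1 - x).clamp(min=eps))


# Reference points: normalized xy stored as logits round-trip through sigmoid.
xy = torch.rand(4, 2)
print(torch.allclose(inverse_sigmoid(xy).sigmoid(), xy, atol=1e-4))  # True

# Focal-loss prior: every class starts at roughly 1% predicted probability.
prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
print(torch.sigmoid(torch.tensor(bias_value)))  # tensor(0.0100)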
+ """ + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + features, pos = self.backbone(samples) + + src, mask = features[-1].decompose() + assert mask is not None + # default pipeline + embedweight = self.refpoint_embed.weight + hs, reference = self.transformer(self.input_proj(src), mask, embedweight, pos[-1]) + + if not self.bbox_embed_diff_each_layer: + reference_before_sigmoid = inverse_sigmoid(reference) + tmp = self.bbox_embed(hs) + tmp[..., :self.query_dim] += reference_before_sigmoid + outputs_coord = tmp.sigmoid() + else: + reference_before_sigmoid = inverse_sigmoid(reference) + outputs_coords = [] + for lvl in range(hs.shape[0]): + tmp = self.bbox_embed[lvl](hs[lvl]) + tmp[..., :self.query_dim] += reference_before_sigmoid[lvl] + outputs_coord = tmp.sigmoid() + outputs_coords.append(outputs_coord) + outputs_coord = torch.stack(outputs_coords) + + outputs_class = self.class_embed(hs) + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} + if self.aux_loss: + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + +class SetCriterion(nn.Module): + """ This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """ Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer('empty_weight', empty_weight) + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + losses = {'loss_ce': loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs['pred_logits'] + device = pred_logits.device + tgt_lengths = torch.as_tensor([len(v["labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + + losses = {} + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(generalized_box_iou( + box_cxcywh_to_xyxy(src_boxes), + box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], + mode="bilinear", align_corners=False) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'labels': self.loss_labels, + 'cardinality': self.loss_cardinality, + 'boxes': self.loss_boxes, + 'masks': self.loss_masks + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / comm.get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. 
+ continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs = {'log': False} + l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k + f'_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +class PostProcess(nn.Module): + """ This module converts the model's output into the format expected by the coco api""" + @torch.no_grad() + def forward(self, outputs, target_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch + For evaluation, this must be the original image size (before any data augmentation) + For visualization, this should be the image size after data augment, but before padding + """ + out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] + + assert len(out_logits) == len(target_sizes) + assert target_sizes.shape[1] == 2 + + prob = F.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{'scores': s, 'labels': l, 'boxes': b} for s, l, b in zip(scores, labels, boxes)] + + return results + + + + diff --git a/yolov7/modeling/meta_arch/detr.py b/yolov7/modeling/meta_arch/detr.py old mode 100755 new mode 100644 index 70db509..6ab54d0 --- a/yolov7/modeling/meta_arch/detr.py +++ b/yolov7/modeling/meta_arch/detr.py @@ -24,7 +24,7 @@ from alfred.utils.log import logger from ..backbone.detr_backbone import Joiner, PositionEmbeddingSine, Transformer -from .detr_seg import DETRsegm, PostProcessPanoptic, PostProcessSegm +from .detr_seg import DETRsegm, PostProcessPanoptic, PostProcessSegm, sigmoid_focal_loss, dice_loss __all__ = ["Detr"] @@ -173,6 +173,7 @@ def forward(self, batched_inputs): for k in loss_dict.keys(): if k in weight_dict: loss_dict[k] *= weight_dict[k] + print(loss_dict) return loss_dict else: if self.onnx_export: @@ -181,7 +182,7 @@ def forward(self, batched_inputs): scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) box_pred = box_cxcywh_to_xyxy(box_pred) labels = labels.to(torch.float) - print(scores.shape) + # print(scores.shape) # print(scores.unsqueeze(0).shape) a = torch.cat([box_pred, scores.unsqueeze(-1), labels.unsqueeze(-1)], dim=-1) return a @@ -189,6 +190,8 @@ def forward(self, batched_inputs): box_cls = output["pred_logits"] box_pred = output["pred_boxes"] mask_pred = output["pred_masks"] if self.mask_on else None + + # print(mask_pred.shape) results = self.inference(box_cls, box_pred, mask_pred, images.image_sizes) processed_results = [] for results_per_image, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): @@ -244,12 +247,18 @@ def inference(self, box_cls, box_pred, mask_pred, image_sizes): result.pred_boxes.scale(scale_x=image_size[1], scale_y=image_size[0]) if self.mask_on: - mask = F.interpolate(mask_pred[i].unsqueeze(0), size=image_size, mode='bilinear', align_corners=False) + mask_pred_per_image = mask_pred[i] + mask_pred_per_image = mask_pred_per_image[indexes] + + mask = F.interpolate(mask_pred_per_image.unsqueeze(0), size=image_size, mode='bilinear', align_corners=False) mask = mask[0].sigmoid() > 0.5 B, N, H, W = mask_pred.shape 
- mask = BitMasks(mask.cpu()).crop_and_resize(result.pred_boxes.tensor.cpu(), 32) - result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device) - + # print('mask_pred shape: ', mask.shape) + # mask = BitMasks(mask.cpu()).crop_and_resize(result.pred_boxes.tensor.cpu(), 32) + mask = BitMasks(mask.cpu()) + # result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device) + result.pred_bit_masks = mask.to(mask_pred_per_image.device) + # print('box_pred_per_image: ', box_pred_per_image.shape) result.scores = scores_per_image result.pred_classes = labels_per_image results.append(result) @@ -438,7 +447,6 @@ def forward(self, samples: NestedTensor): # print('samples: ', samples.shape) # print(samples, type(samples)) features, pos = self.backbone(samples) - # print(features, 'features') src, mask = features[-1].decompose() assert mask is not None diff --git a/yolov7/modeling/meta_arch/detr_d2go.py b/yolov7/modeling/meta_arch/detr_d2go.py new file mode 100644 index 0000000..37e8820 --- /dev/null +++ b/yolov7/modeling/meta_arch/detr_d2go.py @@ -0,0 +1,605 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from typing import Dict +import numpy as np +import torch +import torch.nn.functional as F +from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, detector_postprocess +from detectron2.structures import Boxes, ImageList, Instances, BitMasks +from yolov7.utils.boxes import convert_coco_poly_to_mask +from ..backbone.smcadetr_backbone import Joiner +from ..backbone.detr_backbone import Transformer as DetrTransformer +from ..backbone.smcadetr_backbone import Transformer as SMCATransformer +from ..backbone.detr_backbone import PositionEmbeddingSine, MLP +from yolov7.utils.detr_utils import HungarianMatcherD2go + +from yolov7.modeling.loss.setcriterion import SetCriterion, FocalLossSetCriterion +from yolov7.utils.misc import NestedTensor, nested_tensor_from_tensor_list +from yolov7.utils.boxes import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh +from .detr_seg import DETRsegm, PostProcessSegm +from .smca_detr import DETR as SMCADETR +from torch import nn +from alfred.utils.log import logger + + +__all__ = ["DetrD2go"] + + +class ResNetMaskedBackbone(nn.Module): + """This is a thin wrapper around D2's backbone to provide padding masking""" + + def __init__(self, cfg): + super().__init__() + self.backbone = build_backbone(cfg) + backbone_shape = self.backbone.output_shape() + if cfg.MODEL.DETR.NUM_FEATURE_LEVELS > 1: + self.strides = [8, 16, 32] + else: + self.strides = [32] + + if cfg.MODEL.RESNETS.RES5_DILATION == 2: + # fix dilation from d2 + self.backbone.stages[-1][0].conv2.dilation = (1, 1) + self.backbone.stages[-1][0].conv2.padding = (1, 1) + self.strides[-1] = self.strides[-1] // 2 + + self.feature_strides = [ + backbone_shape[f].stride for f in backbone_shape.keys()] + self.num_channels = [ + backbone_shape[k].channels for k in backbone_shape.keys()] + + def forward(self, images): + if isinstance(images, ImageList): + features = self.backbone(images.tensor) + device = images.tensor.device + else: + features = self.backbone(images.tensors) + device = images.tensors.device + + if torch.onnx.is_in_onnx_export(): + logger.info('[onnx export] in MaskedBackbone...') + out: Dict[str, NestedTensor] = {} + for name, x in features.items(): + m = images.mask + print('m: ', m) + print('m: ', m.shape) + assert m is not None + sp = x.shape[-2:] + # mask = F.interpolate(m.to(torch.float), size=sp).to(torch.bool)[0] + # mask = F.interpolate(m[None].float(), 
size=x.shape[-2:]).to(torch.bool)[0] + m = m.unsqueeze(0).float() + mask = F.interpolate(m, size=x.shape[-2:]).to(torch.bool)[0] + print(mask.shape) + out[name] = NestedTensor(x, mask) + return out + else: + features = self.backbone(images.tensor) + # one tensor per feature level. Each tensor has shape (B, maxH, maxW) + masks = self.mask_out_padding( + [features_per_level.shape for features_per_level in features.values()], + images.image_sizes, + images.tensor.device, + ) + assert len(features) == len(masks) + for i, k in enumerate(features.keys()): + features[k] = NestedTensor(features[k], masks[i]) + return features + + def mask_out_padding(self, feature_shapes, image_sizes, device): + masks = [] + assert len(feature_shapes) == len(self.feature_strides) + for idx, shape in enumerate(feature_shapes): + N, _, H, W = shape + masks_per_feature_level = torch.ones( + (N, H, W), dtype=torch.bool, device=device + ) + for img_idx, (h, w) in enumerate(image_sizes): + masks_per_feature_level[ + img_idx, + : int(np.ceil(float(h) / self.feature_strides[idx])), + : int(np.ceil(float(w) / self.feature_strides[idx])), + ] = 0 + masks.append(masks_per_feature_level) + return masks + + +class FBNetMaskedBackbone(ResNetMaskedBackbone): + """This is a thin wrapper around D2's backbone to provide padding masking""" + + def __init__(self, cfg): + nn.Module.__init__(self) + self.backbone = build_backbone(cfg) + self.out_features = cfg.MODEL.FBNET_V2.OUT_FEATURES + self.feature_strides = list( + self.backbone._out_feature_strides.values()) + self.num_channels = [ + self.backbone._out_feature_channels[k] for k in self.out_features + ] + self.strides = [ + self.backbone._out_feature_strides[k] for k in self.out_features + ] + + def forward(self, images): + features = self.backbone(images.tensor) + masks = self.mask_out_padding( + [features_per_level.shape for features_per_level in features.values()], + images.image_sizes, + images.tensor.device, + ) + assert len(features) == len(masks) + ret_features = {} + for i, k in enumerate(features.keys()): + if k in self.out_features: + ret_features[k] = NestedTensor(features[k], masks[i]) + return ret_features + + +class SimpleSingleStageBackbone(ResNetMaskedBackbone): + """This is a simple wrapper for single stage backbone, + please set the required configs: + cfg.MODEL.BACKBONE.SIMPLE == True, + cfg.MODEL.BACKBONE.STRIDE, cfg.MODEL.BACKBONE.CHANNEL + """ + + def __init__(self, cfg): + nn.Module.__init__(self) + self.backbone = build_backbone(cfg) + self.out_features = ["out"] + assert cfg.MODEL.BACKBONE.SIMPLE is True + self.feature_strides = [cfg.MODEL.BACKBONE.STRIDE] + self.num_channels = [cfg.MODEL.BACKBONE.CHANNEL] + self.strides = [cfg.MODEL.BACKBONE.STRIDE] + + def forward(self, images): + # print(images.tensor.shape) + y = self.backbone(images.tensor) + if isinstance(y, Dict): + y = list(y.values())[-1] + # print(y.shape) + masks = self.mask_out_padding( + [y.shape], + images.image_sizes, + images.tensor.device, + ) + assert len(masks) == 1 + ret_features = {} + ret_features[self.out_features[0]] = NestedTensor(y, masks[0]) + return ret_features + + +@META_ARCH_REGISTRY.register() +class DetrD2go(nn.Module): + """ + Implement Detr + """ + + def __init__(self, cfg): + super().__init__() + + self.device = torch.device(cfg.MODEL.DEVICE) + + self.num_classes = cfg.MODEL.DETR.NUM_CLASSES + self.mask_on = cfg.MODEL.MASK_ON + hidden_dim = cfg.MODEL.DETR.HIDDEN_DIM + num_queries = cfg.MODEL.DETR.NUM_OBJECT_QUERIES + # Transformer parameters: + nheads = 
cfg.MODEL.DETR.NHEADS + dropout = cfg.MODEL.DETR.DROPOUT + dim_feedforward = cfg.MODEL.DETR.DIM_FEEDFORWARD + enc_layers = cfg.MODEL.DETR.ENC_LAYERS + dec_layers = cfg.MODEL.DETR.DEC_LAYERS + pre_norm = cfg.MODEL.DETR.PRE_NORM + + # Loss parameters: + giou_weight = cfg.MODEL.DETR.GIOU_WEIGHT + l1_weight = cfg.MODEL.DETR.L1_WEIGHT + cls_weight = cfg.MODEL.DETR.CLS_WEIGHT + deep_supervision = cfg.MODEL.DETR.DEEP_SUPERVISION + no_object_weight = cfg.MODEL.DETR.NO_OBJECT_WEIGHT + centered_position_encoding = cfg.MODEL.DETR.CENTERED_POSITION_ENCODIND + num_feature_levels = cfg.MODEL.DETR.NUM_FEATURE_LEVELS + + N_steps = hidden_dim // 2 + if "resnet" in cfg.MODEL.BACKBONE.NAME.lower(): + d2_backbone = ResNetMaskedBackbone(cfg) + elif "fbnet" in cfg.MODEL.BACKBONE.NAME.lower(): + d2_backbone = FBNetMaskedBackbone(cfg) + elif cfg.MODEL.BACKBONE.SIMPLE: + d2_backbone = SimpleSingleStageBackbone(cfg) + else: + raise NotImplementedError + + backbone = Joiner( + d2_backbone, + PositionEmbeddingSine( + N_steps, normalize=True, centered=centered_position_encoding + ), + ) + backbone.num_channels = d2_backbone.num_channels + self.use_focal_loss = cfg.MODEL.DETR.USE_FOCAL_LOSS + + if cfg.MODEL.DETR.ATTENTION_TYPE == 'deformable': + print('Deformable not supported now.') + elif cfg.MODEL.DETR.ATTENTION_TYPE == 'DETR': + transformer = DetrTransformer( + d_model=hidden_dim, + dropout=dropout, + nhead=nheads, + dim_feedforward=dim_feedforward, + num_encoder_layers=enc_layers, + num_decoder_layers=dec_layers, + normalize_before=pre_norm, + return_intermediate_dec=deep_supervision, + ) + self.detr = DETR( + backbone, + transformer, + num_classes=self.num_classes, + num_queries=num_queries, + aux_loss=deep_supervision, + use_focal_loss=self.use_focal_loss, + ) + elif cfg.MODEL.DETR.ATTENTION_TYPE == 'SMCA': + transformer = SMCATransformer( + d_model=hidden_dim, + dropout=dropout, + nhead=nheads, + dim_feedforward=dim_feedforward, + num_encoder_layers=enc_layers, + num_decoder_layers=dec_layers, + normalize_before=pre_norm, + return_intermediate_dec=deep_supervision, + dynamic_scale='type3' + ) + self.detr = SMCADETR( + backbone, + transformer, + num_classes=self.num_classes, + num_queries=num_queries, + aux_loss=deep_supervision, + num_feature_levels=num_feature_levels, + use_focal_loss=self.use_focal_loss, + ) + else: + logger.error( + f'attention type: {cfg.MODEL.DETR.ATTENTION_TYPE} not support') + exit(1) + + if self.mask_on: + frozen_weights = cfg.MODEL.DETR.FROZEN_WEIGHTS + if frozen_weights != "": + print("LOAD pre-trained weights") + weight = torch.load( + frozen_weights, map_location=lambda storage, loc: storage + )["model"] + new_weight = {} + for k, v in weight.items(): + if "detr." 
in k: + new_weight[k.replace("detr.", "")] = v + else: + print(f"Skipping loading weight {k} from frozen model") + del weight + self.detr.load_state_dict(new_weight) + del new_weight + self.detr = DETRsegm(self.detr, freeze_detr=(frozen_weights != "")) + self.seg_postprocess = PostProcessSegm + + self.detr.to(self.device) + + # building criterion + matcher = HungarianMatcherD2go( + cost_class=cls_weight, + cost_bbox=l1_weight, + cost_giou=giou_weight, + use_focal_loss=self.use_focal_loss, + ) + weight_dict = {"loss_ce": cls_weight, "loss_bbox": l1_weight} + weight_dict["loss_giou"] = giou_weight + if deep_supervision: + aux_weight_dict = {} + for i in range(dec_layers - 1): + aux_weight_dict.update( + {k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + losses = ["labels", "boxes", "cardinality"] + if self.mask_on: + losses += ["masks"] + if self.use_focal_loss: + self.criterion = FocalLossSetCriterion( + self.num_classes, + matcher=matcher, + weight_dict=weight_dict, + losses=losses, + ) + else: + self.criterion = SetCriterion( + self.num_classes, + matcher=matcher, + weight_dict=weight_dict, + eos_coef=no_object_weight, + losses=losses, + ) + self.criterion.to(self.device) + + pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to( + self.device).view(3, 1, 1) + pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to( + self.device).view(3, 1, 1) + self.normalizer = lambda x: (x - pixel_mean) / pixel_std + self.to(self.device) + + def preprocess_input(self, x): + # x = x.permute(0, 3, 1, 2) + # x = F.interpolate(x, size=(640, 640)) + # x = F.interpolate(x, size=(512, 960)) + """ + x is N, CHW aleady permuted + """ + x = [self.normalizer(i) for i in x] + return x + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper` . + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + + * image: Tensor, image in (C, H, W) format. + * instances: Instances + + Other information that's included in the original dicts, such as: + + * "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + Returns: + dict[str: Tensor]: + mapping from a named loss to a tensor storing the loss. Used during training only. + """ + if torch.onnx.is_in_onnx_export(): + logger.info('[WARN] exporting onnx...') + assert isinstance(batched_inputs, (list, torch.Tensor)) or isinstance( + batched_inputs, list), 'onnx export, batched_inputs only needs image tensor or list of tensors' + images = self.preprocess_input(batched_inputs) + # batched_inputs = batched_inputs.permute(0, 3, 1, 2) + # image_ori_sizes = [batched_inputs.shape[1:3]] + else: + images = self.preprocess_image(batched_inputs) + output = self.detr(images) + + if self.training: + gt_instances = [x["instances"].to( + self.device) for x in batched_inputs] + + # targets: List[Dict[str, torch.Tensor]]. 
Keys + # "labels": [NUM_BOX,] + # "boxes": [NUM_BOX, 4] + targets = self.prepare_targets(gt_instances) + loss_dict = self.criterion(output, targets) + weight_dict = self.criterion.weight_dict + for k in loss_dict.keys(): + if not loss_dict[k].requires_grad: + loss_dict[k] = loss_dict[k].new_tensor(0) + if k in weight_dict: + loss_dict[k] *= weight_dict[k] + # print(loss_dict) + return loss_dict + else: + if torch.onnx.is_in_onnx_export(): + box_cls = output["pred_logits"] + box_pred = output["pred_boxes"] + if self.use_focal_loss: + prob = box_cls.sigmoid() + # TODO make top-100 as an option for non-focal-loss as well + scores, topk_indexes = torch.topk( + prob.view(box_cls.shape[0], -1), 100, dim=1 + ) + topk_boxes = topk_indexes // box_cls.shape[2] + labels = topk_indexes % box_cls.shape[2] + + print('topk_boxes: ', topk_boxes.shape) + print('box_pred: ', box_pred.shape) + topk_boxes = topk_boxes.to(torch.int64) + topk_boxes = topk_boxes.unsqueeze(-1).repeat(1, 1, 4) + print('topk_boxes: ', topk_boxes.shape) + box_pred = torch.gather(box_pred, 1, topk_boxes) + else: + scores, labels = F.softmax( + box_cls, dim=-1)[:, :, :-1].max(-1) + box_pred = box_cxcywh_to_xyxy(box_pred) + labels = labels.to(torch.float) + # print(scores.shape) + # print(scores.unsqueeze(0).shape) + a = torch.cat([box_pred, scores.unsqueeze(-1), + labels.unsqueeze(-1)], dim=-1) + return a + else: + box_cls = output["pred_logits"] + box_pred = output["pred_boxes"] + mask_pred = output["pred_masks"] if self.mask_on else None + results = self.inference( + box_cls, box_pred, mask_pred, images.image_sizes) + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + results, batched_inputs, images.image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + return processed_results + + def prepare_targets(self, targets): + new_targets = [] + for targets_per_image in targets: + h, w = targets_per_image.image_size + image_size_xyxy = torch.as_tensor( + [w, h, w, h], dtype=torch.float, device=self.device + ) + gt_classes = targets_per_image.gt_classes # shape (NUM_BOX,) + gt_boxes = targets_per_image.gt_boxes.tensor / image_size_xyxy + gt_boxes = box_xyxy_to_cxcywh(gt_boxes) # shape (NUM_BOX, 4) + new_targets.append({"labels": gt_classes, "boxes": gt_boxes}) + if self.mask_on and hasattr(targets_per_image, "gt_masks"): + gt_masks = targets_per_image.gt_masks + gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) + new_targets[-1].update({"masks": gt_masks}) + return new_targets + + def inference(self, box_cls, box_pred, mask_pred, image_sizes): + """ + Arguments: + box_cls (Tensor): tensor of shape (batch_size, num_queries, K). + The tensor predicts the classification probability for each query. + box_pred (Tensor): tensors of shape (batch_size, num_queries, 4). + The tensor predicts 4-vector (x,y,w,h) box + regression values for every queryx + image_sizes (List[torch.Size]): the input image sizes + + Returns: + results (List[Instances]): a list of #images elements. + """ + assert len(box_cls) == len(image_sizes) + results = [] + + # For each box we assign the best class or the second best if the best on is `no_object`. 
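# --- Illustrative sketch (editor's note, not part of the patch) ---
# With use_focal_loss the class scores are sigmoids over (num_queries, num_classes).
# Taking top-k over the flattened score matrix and decoding the flat indices with
# integer division / modulo recovers which query and which class each score came
# from, as the block below does. Toy shapes here are assumptions for illustration.
import torch

scores_3d = torch.rand(1, 300, 80)              # (batch, num_queries, num_classes)
flat = scores_3d.view(1, -1)                    # (batch, num_queries * num_classes)
topk_scores, topk_idx = torch.topk(flat, k=100, dim=1)
query_idx = topk_idx // scores_3d.shape[2]      # which query produced each score
class_idx = topk_idx % scores_3d.shape[2]       # which class it belongs to
# query_idx can then be used to gather the matching boxes, e.g.
# torch.gather(boxes, 1, query_idx.unsqueeze(-1).repeat(1, 1, 4))
# --- end sketch ---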
+ if self.use_focal_loss: + prob = box_cls.sigmoid() + # TODO make top-100 as an option for non-focal-loss as well + scores, topk_indexes = torch.topk( + prob.view(box_cls.shape[0], -1), 100, dim=1 + ) + topk_boxes = topk_indexes // box_cls.shape[2] + labels = topk_indexes % box_cls.shape[2] + else: + scores, labels = F.softmax(box_cls, dim=-1)[:, :, :-1].max(-1) + + for i, ( + scores_per_image, + labels_per_image, + box_pred_per_image, + image_size, + ) in enumerate(zip(scores, labels, box_pred, image_sizes)): + result = Instances(image_size) + boxes = box_cxcywh_to_xyxy(box_pred_per_image) + if self.use_focal_loss: + boxes = torch.gather( + boxes, 0, topk_boxes[i].unsqueeze(-1).repeat(1, 4)) + + result.pred_boxes = Boxes(boxes) + result.pred_boxes.scale( + scale_x=image_size[1], scale_y=image_size[0]) + if self.mask_on: + mask = F.interpolate( + mask_pred[i].unsqueeze(0), + size=image_size, + mode="bilinear", + align_corners=False, + ) + mask = mask[0].sigmoid() > 0.5 + B, N, H, W = mask_pred.shape + mask = BitMasks(mask.cpu()).crop_and_resize( + result.pred_boxes.tensor.cpu(), 32 + ) + result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device) + + result.scores = scores_per_image + result.pred_classes = labels_per_image + results.append(result) + return results + + def preprocess_image(self, batched_inputs): + """ + Normalize, pad and batch the input images. + """ + images = [self.normalizer(x["image"].to(self.device)) + for x in batched_inputs] + images = ImageList.from_tensors(images) + return images + + +class DETR(nn.Module): + """This is the DETR module that performs object detection""" + + def __init__( + self, + backbone, + transformer, + num_classes, + num_queries, + aux_loss=False, + use_focal_loss=False, + ): + """Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + hidden_dim = transformer.d_model + self.class_embed = nn.Linear( + hidden_dim, num_classes if use_focal_loss else num_classes + 1 + ) + self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + self.query_embed = nn.Embedding(num_queries, hidden_dim) + self.input_proj = nn.Conv2d( + backbone.num_channels[-1], hidden_dim, kernel_size=1 + ) + self.backbone = backbone + self.aux_loss = aux_loss + + def forward(self, samples: NestedTensor): + """The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x (num_classes + 1)] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, height, width). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. 
+ - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. + """ + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + features, pos = self.backbone(samples) + + # src shape (B, C, H, W) + # mask shape (B, H, W) + src, mask = features[-1].decompose() + assert mask is not None + # hs shape (NUM_LAYER, B, S, hidden_dim) + hs = self.transformer( + self.input_proj(src), mask, self.query_embed.weight, pos[-1] + )[0] + # shape (NUM_LAYER, B, S, NUM_CLASS + 1) + outputs_class = self.class_embed(hs) + # shape (NUM_LAYER, B, S, 4) + outputs_coord = self.bbox_embed(hs).sigmoid() + # pred_logits shape (B, S, NUM_CLASS + 1) + # pred_boxes shape (B, S, 4) + out = {"pred_logits": outputs_class[-1], + "pred_boxes": outputs_coord[-1]} + if self.aux_loss: + out["aux_outputs"] = self._set_aux_loss( + outputs_class, outputs_coord) + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [ + {"pred_logits": a, "pred_boxes": b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1]) + ] diff --git a/yolov7/modeling/meta_arch/detr_seg.py b/yolov7/modeling/meta_arch/detr_seg.py old mode 100755 new mode 100644 index d2570de..0c52fa1 --- a/yolov7/modeling/meta_arch/detr_seg.py +++ b/yolov7/modeling/meta_arch/detr_seg.py @@ -31,8 +31,10 @@ def __init__(self, detr, freeze_detr=False): p.requires_grad_(False) hidden_dim, nheads = detr.transformer.d_model, detr.transformer.nhead - self.bbox_attention = MHAttentionMap(hidden_dim, hidden_dim, nheads, dropout=0.0) - self.mask_head = MaskHeadSmallConv(hidden_dim + nheads, [1024, 512, 256], hidden_dim) + self.bbox_attention = MHAttentionMap( + hidden_dim, hidden_dim, nheads, dropout=0.0) + self.mask_head = MaskHeadSmallConv( + hidden_dim + nheads, [1024, 512, 256], hidden_dim) def forward(self, samples: NestedTensor): if isinstance(samples, (list, torch.Tensor)): @@ -44,20 +46,29 @@ def forward(self, samples: NestedTensor): src, mask = features[-1].decompose() assert mask is not None src_proj = self.detr.input_proj(src) - hs, memory = self.detr.transformer(src_proj, mask, self.detr.query_embed.weight, pos[-1]) + hs, memory = self.detr.transformer( + src_proj, mask, self.detr.query_embed.weight, pos[-1]) outputs_class = self.detr.class_embed(hs) outputs_coord = self.detr.bbox_embed(hs).sigmoid() - out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]} + out = {"pred_logits": outputs_class[-1], + "pred_boxes": outputs_coord[-1]} if self.detr.aux_loss: - out['aux_outputs'] = self.detr._set_aux_loss(outputs_class, outputs_coord) + out['aux_outputs'] = self.detr._set_aux_loss( + outputs_class, outputs_coord) # FIXME h_boxes takes the last one computed, keep this in mind bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask) - seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors]) - outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + print('box_mask: ', bbox_mask.shape) + print('outputs_coord: ', outputs_coord.shape) + seg_masks = self.mask_head(src_proj, bbox_mask, [ + features[2].tensors, features[1].tensors, features[0].tensors]) + outputs_seg_masks = seg_masks.view( + 
bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + + print('outputs_seg_masks: ', outputs_seg_masks.shape) out["pred_masks"] = outputs_seg_masks return out @@ -75,7 +86,8 @@ class MaskHeadSmallConv(nn.Module): def __init__(self, dim, fpn_dims, context_dim): super().__init__() - inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + inter_dims = [dim, context_dim // 2, context_dim // 4, + context_dim // 8, context_dim // 16, context_dim // 64] self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) self.gn1 = torch.nn.GroupNorm(8, dim) self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) @@ -100,7 +112,8 @@ def __init__(self, dim, fpn_dims, context_dim): nn.init.constant_(m.bias, 0) def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): - x = torch.cat([_expand(x, bbox_mask.shape[1]), bbox_mask.flatten(0, 1)], 1) + x = torch.cat([_expand(x, bbox_mask.shape[1]), + bbox_mask.flatten(0, 1)], 1) x = self.lay1(x) x = self.gn1(x) @@ -157,10 +170,14 @@ def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): def forward(self, q, k, mask: Optional[Tensor] = None): q = self.q_linear(q) - k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) - qh = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) - kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) - weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) + k = F.conv2d( + k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + qh = q.view(q.shape[0], q.shape[1], self.num_heads, + self.hidden_dim // self.num_heads) + kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // + self.num_heads, k.shape[-2], k.shape[-1]) + weights = torch.einsum("bqnc,bnchw->bqnhw", + qh * self.normalize_fact, kh) if mask is not None: weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) @@ -204,7 +221,8 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f Loss tensor """ prob = inputs.sigmoid() - ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + ce_loss = F.binary_cross_entropy_with_logits( + inputs, targets, reduction="none") p_t = prob * targets + (1 - prob) * (1 - targets) loss = ce_loss * ((1 - p_t) ** gamma) @@ -215,6 +233,43 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f return loss.mean(1).sum() / num_boxes +def sigmoid_focal_loss_with_mode(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, mode="mask"): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + num_boxes: num of prediction instance + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + mode: a str, either "mask" or "box". When mode equal "mask", the loss would be averaged in + the first dimension. 
+ Returns: + Loss tensor + """ + assert mode in ["mask", "box"] + + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits( + inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if mode == "mask": + loss = loss.mean(1) + + return loss.sum() / num_boxes + + class PostProcessSegm(nn.Module): def __init__(self, threshold=0.5): super().__init__() @@ -225,10 +280,14 @@ def forward(self, results, outputs, orig_target_sizes, max_target_sizes): assert len(orig_target_sizes) == len(max_target_sizes) max_h, max_w = max_target_sizes.max(0)[0].tolist() outputs_masks = outputs["pred_masks"].squeeze(2) - outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) + outputs_masks = F.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu() - for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + for i, (cur_mask, t, tt) in enumerate( + zip(outputs_masks, max_target_sizes, orig_target_sizes) + ): img_h, img_w = t[0], t[1] results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) results[i]["masks"] = F.interpolate( @@ -240,7 +299,7 @@ def forward(self, results, outputs, orig_target_sizes, max_target_sizes): class PostProcessPanoptic(nn.Module): """This class converts the output of the model to the final panoptic result, in the format expected by the - coco panoptic API """ + coco panoptic API""" def __init__(self, is_thing_map, threshold=0.85): """ @@ -253,19 +312,23 @@ def __init__(self, is_thing_map, threshold=0.85): self.threshold = threshold self.is_thing_map = is_thing_map - def forward(self, outputs, processed_sizes, target_sizes=None): - """ This function computes the panoptic prediction from the model's predictions. + def forward(self, outputs, processed_sizes, target_sizes=None): # noqa: C901 + """This function computes the panoptic prediction from the model's predictions. Parameters: outputs: This is a dict coming directly from the model. See the model doc for the content. processed_sizes: This is a list of tuples (or torch tensors) of sizes of the images that were passed to the model, ie the size after data augmentation but before batching. target_sizes: This is a list of tuples (or torch tensors) corresponding to the requested final size of each prediction. 
If left to None, it will default to the processed_sizes - """ + """ if target_sizes is None: target_sizes = processed_sizes assert len(processed_sizes) == len(target_sizes) - out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"] + out_logits, raw_masks, raw_boxes = ( + outputs["pred_logits"], + outputs["pred_masks"], + outputs["pred_boxes"], + ) assert len(out_logits) == len(raw_masks) == len(target_sizes) preds = [] @@ -279,12 +342,16 @@ def to_tuple(tup): ): # we filter empty queries and detection below threshold scores, labels = cur_logits.softmax(-1).max(-1) - keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & (scores > self.threshold) + keep = labels.ne(outputs["pred_logits"].shape[-1] - 1) & ( + scores > self.threshold + ) cur_scores, cur_classes = cur_logits.softmax(-1).max(-1) cur_scores = cur_scores[keep] cur_classes = cur_classes[keep] cur_masks = cur_masks[keep] - cur_masks = interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = interpolate( + cur_masks[:, None], to_tuple(size), mode="bilinear" + ).squeeze(1) cur_boxes = box_ops.box_cxcywh_to_xyxy(cur_boxes[keep]) h, w = cur_masks.shape[-2:] @@ -306,7 +373,8 @@ def get_ids_area(masks, scores, dedup=False): if m_id.shape[-1] == 0: # We didn't detect any mask :( - m_id = torch.zeros((h, w), dtype=torch.long, device=m_id.device) + m_id = torch.zeros( + (h, w), dtype=torch.long, device=m_id.device) else: m_id = m_id.argmax(-1).view(h, w) @@ -319,11 +387,17 @@ def get_ids_area(masks, scores, dedup=False): final_h, final_w = to_tuple(target_size) - seg_img = Image.fromarray(id2rgb(m_id.view(h, w).cpu().numpy())) - seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) + seg_img = Image.fromarray( + id2rgb(m_id.view(h, w).cpu().numpy())) + seg_img = seg_img.resize( + size=(final_w, final_h), resample=Image.NEAREST + ) np_seg_img = ( - torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy() + torch.ByteTensor( + torch.ByteStorage.from_buffer(seg_img.tobytes())) + .view(final_h, final_w, 3) + .numpy() ) m_id = torch.from_numpy(rgb2id(np_seg_img)) @@ -337,7 +411,9 @@ def get_ids_area(masks, scores, dedup=False): # We know filter empty masks as long as we find some while True: filtered_small = torch.as_tensor( - [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + [area[i] <= 4 for i, c in enumerate(cur_classes)], + dtype=torch.bool, + device=keep.device, ) if filtered_small.any().item(): cur_scores = cur_scores[~filtered_small] @@ -348,16 +424,27 @@ def get_ids_area(masks, scores, dedup=False): break else: - cur_classes = torch.ones(1, dtype=torch.long, device=cur_classes.device) + cur_classes = torch.ones( + 1, dtype=torch.long, device=cur_classes.device) segments_info = [] for i, a in enumerate(area): cat = cur_classes[i].item() - segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a}) + segments_info.append( + { + "id": i, + "isthing": self.is_thing_map[cat], + "category_id": cat, + "area": a, + } + ) del cur_classes with io.BytesIO() as out: seg_img.save(out, format="PNG") - predictions = {"png_string": out.getvalue(), "segments_info": segments_info} + predictions = { + "png_string": out.getvalue(), + "segments_info": segments_info, + } preds.append(predictions) return preds diff --git a/yolov7/modeling/meta_arch/maskrcnn_seg.py b/yolov7/modeling/meta_arch/maskrcnn_seg.py old mode 100755 new mode 100644 
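Editor's note: the detr_seg.py hunk above introduces sigmoid_focal_loss_with_mode, a focal-loss variant whose reduction depends on whether it is applied to per-pixel mask logits (mode="mask", averaged over dim 1 before normalizing by num_boxes) or to per-box logits (mode="box", summed directly). Below is a minimal usage sketch; the tensor shapes are toy values chosen purely for illustration.

import torch
from yolov7.modeling.meta_arch.detr_seg import sigmoid_focal_loss_with_mode

num_boxes = 4
logits = torch.randn(num_boxes, 1024)                     # e.g. flattened mask logits per matched instance
targets = torch.randint(0, 2, (num_boxes, 1024)).float()  # binary targets of the same shape

# mode="mask": the focal term is averaged over dim 1 (the pixels of each mask),
# then summed over instances and divided by num_boxes.
mask_loss = sigmoid_focal_loss_with_mode(logits, targets, num_boxes, mode="mask")

# mode="box": no per-instance averaging; the element-wise loss is summed and
# divided by num_boxes (this is what SetCriterion.loss_labels in smca_detr.py uses).
box_loss = sigmoid_focal_loss_with_mode(logits, targets, num_boxes, mode="box")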
diff --git a/yolov7/modeling/meta_arch/smca_detr.py b/yolov7/modeling/meta_arch/smca_detr.py new file mode 100644 index 0000000..a7109c9 --- /dev/null +++ b/yolov7/modeling/meta_arch/smca_detr.py @@ -0,0 +1,748 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from collections import OrderedDict +import logging +import math +from typing import List, Dict + +import numpy as np +from numpy.core.fromnumeric import sort +import torch +import torch.distributed as dist +import torch.nn.functional as F +from scipy.optimize import linear_sum_assignment +from torch import nn + +from detectron2.utils import comm +from detectron2.layers import ShapeSpec +from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, detector_postprocess +from detectron2.structures import Boxes, ImageList, Instances, BitMasks, PolygonMasks +from detectron2.utils.logger import log_first_n +from fvcore.nn import giou_loss, smooth_l1_loss +import torchvision + +from yolov7.utils.detr_utils import HungarianMatcherSMCA +from yolov7.utils.boxes import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh, convert_coco_poly_to_mask, generalized_box_iou +from yolov7.utils.misc import NestedTensor, nested_tensor_from_tensor_list, is_dist_avail_and_initialized, accuracy + +from alfred.utils.log import logger +from alfred.dl.torch.common import device + +import pickle + +from ..backbone.smcadetr_backbone import Joiner, PositionEmbeddingSine, Transformer, MLP +from .detr_seg import DETRsegm, PostProcessPanoptic, PostProcessSegm, sigmoid_focal_loss_with_mode, dice_loss + +__all__ = ["Detr"] + + +@META_ARCH_REGISTRY.register() +class SMCADetr(nn.Module): + """ + Implement Detr + """ + + def __init__(self, cfg): + super().__init__() + + self.device = torch.device(cfg.MODEL.DEVICE) + + self.conf_thresh = cfg.MODEL.YOLO.CONF_THRESHOLD + self.ignore_thresh = cfg.MODEL.YOLO.IGNORE_THRESHOLD + self.num_classes = cfg.MODEL.DETR.NUM_CLASSES + self.mask_on = cfg.MODEL.MASK_ON + hidden_dim = cfg.MODEL.DETR.HIDDEN_DIM + self.num_queries = cfg.MODEL.DETR.NUM_OBJECT_QUERIES + # Transformer parameters: + nheads = cfg.MODEL.DETR.NHEADS + dropout = cfg.MODEL.DETR.DROPOUT + dim_feedforward = cfg.MODEL.DETR.DIM_FEEDFORWARD + enc_layers = cfg.MODEL.DETR.ENC_LAYERS + dec_layers = cfg.MODEL.DETR.DEC_LAYERS + num_feature_levels = cfg.MODEL.DETR.NUM_FEATURE_LEVELS + pre_norm = cfg.MODEL.DETR.PRE_NORM + pretrained_weights = cfg.MODEL.WEIGHTS + + # Loss parameters: + giou_weight = cfg.MODEL.DETR.GIOU_WEIGHT + l1_weight = cfg.MODEL.DETR.L1_WEIGHT + deep_supervision = cfg.MODEL.DETR.DEEP_SUPERVISION + no_object_weight = cfg.MODEL.DETR.NO_OBJECT_WEIGHT + + N_steps = hidden_dim // 2 + d2_backbone = MaskedBackboneTraceFriendly(cfg) + backbone = Joiner(d2_backbone, PositionEmbeddingSine( + N_steps, normalize=True)) + backbone.num_channels = d2_backbone.num_channels + + # type1 for no scale, type 2 for dynamic scale, type 3 for dyanmic xy scale, type 4 for covariance matrix scale + dynamic_scale = 'type3' + transformer = Transformer( + d_model=hidden_dim, + dropout=dropout, + nhead=nheads, + dim_feedforward=dim_feedforward, + num_encoder_layers=enc_layers, + num_decoder_layers=dec_layers, + normalize_before=pre_norm, + return_intermediate_dec=deep_supervision, + dynamic_scale=dynamic_scale + ) + + self.detr = DETR( + backbone, transformer, num_classes=self.num_classes, num_queries=self.num_queries, num_feature_levels=num_feature_levels, aux_loss=deep_supervision + ) + if self.mask_on: + frozen_weights = cfg.MODEL.DETR.FROZEN_WEIGHTS + if 
frozen_weights != '': + print("LOAD pre-trained weights") + weight = torch.load( + frozen_weights, map_location=lambda storage, loc: storage)['model'] + new_weight = {} + for k, v in weight.items(): + if 'detr.' in k: + new_weight[k.replace('detr.', '')] = v + else: + print(f"Skipping loading weight {k} from frozen model") + del weight + self.detr.load_state_dict(new_weight) + del new_weight + self.detr = DETRsegm(self.detr, freeze_detr=(frozen_weights != '')) + self.seg_postprocess = PostProcessSegm + + # if pretrained_weights: + # logger.info(f'Loading pretrained weights from: {pretrained_weights}') + # wgts = torch.load(pretrained_weights, map_location=lambda storage, loc: storage) + # new_weight = {} + # for k, v in wgts.items(): + # new_weight[k] = v + # del wgts + # self.detr.load_state_dict(new_weight) + # del new_weight + + self.detr.to(self.device) + + # building criterion + matcher = HungarianMatcherSMCA( + cost_class=1, cost_bbox=l1_weight, cost_giou=giou_weight) + weight_dict = {"loss_ce": 2, "loss_bbox": l1_weight} + weight_dict["loss_giou"] = giou_weight + if deep_supervision: + aux_weight_dict = {} + for i in range(dec_layers - 1): + aux_weight_dict.update( + {k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + losses = ["labels", "boxes", "cardinality"] + if self.mask_on: + losses += ["masks"] + self.criterion = SetCriterion( + self.num_classes, matcher=matcher, weight_dict=weight_dict, eos_coef=no_object_weight, losses=losses, + ) + self.criterion.to(self.device) + + pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to( + self.device).view(3, 1, 1) + pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to( + self.device).view(3, 1, 1) + self.normalizer = lambda x: (x - pixel_mean) / pixel_std + self.to(self.device) + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper` . + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + + * image: Tensor, image in (C, H, W) format. + * instances: Instances + + Other information that's included in the original dicts, such as: + + * "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + Returns: + dict[str: Tensor]: + mapping from a named loss to a tensor storing the loss. Used during training only. 
+ """ + images = self.preprocess_image(batched_inputs) + output = self.detr(images) + + if self.training: + gt_instances = [x["instances"].to( + self.device) for x in batched_inputs] + + targets = self.prepare_targets(gt_instances) + loss_dict = self.criterion(output, targets) + weight_dict = self.criterion.weight_dict + valid_loss_dict = {} + for k in loss_dict.keys(): + if k in weight_dict: + valid_loss_dict[k] = loss_dict[k] * weight_dict[k] + # loss_dict[k] *= weight_dict[k] + # print(loss_dict) + # return loss_dict + return valid_loss_dict + else: + box_cls = output["pred_logits"] + box_pred = output["pred_boxes"] + mask_pred = output["pred_masks"] if self.mask_on else None + results = self.inference( + box_cls, box_pred, mask_pred, images.image_sizes) + processed_results = [] + for results_per_image, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + return processed_results + + def prepare_targets(self, targets): + new_targets = [] + for targets_per_image in targets: + h, w = targets_per_image.image_size + image_size_xyxy = torch.as_tensor( + [w, h, w, h], dtype=torch.float, device=self.device) + gt_classes = targets_per_image.gt_classes + gt_boxes = targets_per_image.gt_boxes.tensor / image_size_xyxy + gt_boxes = box_xyxy_to_cxcywh(gt_boxes) + new_targets.append({"labels": gt_classes, "boxes": gt_boxes}) + if self.mask_on and hasattr(targets_per_image, 'gt_masks'): + gt_masks = targets_per_image.gt_masks + gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w) + new_targets[-1].update({'masks': gt_masks}) + return new_targets + + def inference(self, box_cls, box_pred, mask_pred, image_sizes): + """ + Arguments: + box_cls (Tensor): tensor of shape (batch_size, num_queries, K). + The tensor predicts the classification probability for each query. + box_pred (Tensor): tensors of shape (batch_size, num_queries, 4). + The tensor predicts 4-vector (x,y,w,h) box + regression values for every queryx + image_sizes (List[torch.Size]): the input image sizes + Returns: + results (List[Instances]): a list of #images elements. 
+ """ + assert len(box_cls) == len(image_sizes) + results = [] + + prob = box_cls.sigmoid() + # TODO make top-100 as an option for non-focal-loss as well + scores, topk_indexes = torch.topk( + prob.view(box_cls.shape[0], -1), 100, dim=1 + ) + topk_boxes = topk_indexes // box_cls.shape[2] + labels = topk_indexes % box_cls.shape[2] + + for i, (scores_per_image, labels_per_image, box_pred_per_image, image_size) in enumerate(zip( + scores, labels, box_pred, image_sizes + )): + result = Instances(image_size) + boxes = box_cxcywh_to_xyxy(box_pred_per_image) + boxes = torch.gather( + boxes, 0, topk_boxes[i].unsqueeze(-1).repeat(1, 4)) + result.pred_boxes = Boxes(boxes) + + result.pred_boxes.scale( + scale_x=image_size[1], scale_y=image_size[0]) + if self.mask_on: + mask = F.interpolate(mask_pred[i].unsqueeze( + 0), size=image_size, mode='bilinear', align_corners=False) + mask = mask[0].sigmoid() > 0.5 + B, N, H, W = mask_pred.shape + # print('mask_pred shape: ', mask.shape) + # mask = BitMasks(mask.cpu()).crop_and_resize(result.pred_boxes.tensor.cpu(), 32) + mask = BitMasks(mask.cpu()) + # result.pred_masks = mask.unsqueeze(1).to(mask_pred[0].device) + result.pred_bit_masks = mask.to(mask_pred[i].device) + # print('box_pred_per_image: ', box_pred_per_image.shape) + result.scores = scores_per_image + result.pred_classes = labels_per_image + results.append(result) + return results + + def preprocess_image(self, batched_inputs): + """ + Normalize, pad and batch the input images. + """ + images = [self.normalizer(x["image"].to(self.device)) + for x in batched_inputs] + images = ImageList.from_tensors(images) + return images + + +def bias_init_with_prob(prior_prob): + """initialize conv/fc bias value according to giving probablity.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +class MaskedBackboneTraceFriendly(nn.Module): + """ + This is a thin wrapper around D2's backbone to provide padding masking. + I change it into tracing friendly with this mask operation. + """ + + def __init__(self, cfg): + super().__init__() + self.backbone = build_backbone(cfg) + self.num_feature_levels = cfg.MODEL.DETR.NUM_FEATURE_LEVELS + + # if comm.is_main_process(): + # a = torch.randn([1, 3, 256, 256]).to(device) + # b = self.backbone.model(a) + # print('B: ', b) + # self.backbone = torchvision.models.resnet50(pretrained=True) + backbone_shape = self.backbone.output_shape() + + # pretrained_weights = cfg.MODEL.WEIGHTS + # if pretrained_weights: + # logger.info(f'Loading pretrained weights from: {pretrained_weights}') + # with open(pretrained_weights, 'rb') as f: + # wgts = pickle.load(f, encoding='latin1')['model'] + # # wgts = torch.load(pretrained_weights, map_location=lambda storage, loc: storage) + # new_weight = {} + # for k, v in wgts.items(): + # v = torch.from_numpy(v) + # # new_weight['detr.' 
+ k] = v + # new_weight[k] = v + # del wgts + # self.backbone.load_state_dict(new_weight, strict=False) + # del new_weight + + # if comm.is_main_process(): + # c = self.backbone.model(a) + # print('C: ', c) + + if self.num_feature_levels > 1: + self.num_channels = [512, 1024, 2048] + self.return_interm_layers = ['res3', 'res4', 'res5'] + self.feature_strides = [8, 16, 32] + self.strides = [8, 16, 32] + else: + self.num_channels = [2048] + self.return_interm_layers = ['res5'] + self.feature_strides = [32] + self.strides = [32] + + self.onnx_export = False + + def forward(self, images): + if isinstance(images, ImageList): + features = self.backbone(images.tensor) + device = images.tensor.device + else: + features = self.backbone(images.tensors) + device = images.tensors.device + + if self.onnx_export: + logger.info('[onnx export] in MaskedBackbone...') + out: Dict[str, NestedTensor] = {} + for name, x in features.items(): + m = images.mask + print('m: ', m) + print('m: ', m.shape) + assert m is not None + sp = x.shape[-2:] + # mask = F.interpolate(m.to(torch.float), size=sp).to(torch.bool)[0] + # mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] + m = m.unsqueeze(0).float() + mask = F.interpolate(m, size=x.shape[-2:]).to(torch.bool)[0] + print(mask.shape) + out[name] = NestedTensor(x, mask) + return out + else: + # masks = self.mask_out_padding( + # [features_per_level.shape for features_per_level in features.values()], + # images.image_sizes, + # device, + # ) + # assert len(features) == len(masks) + # for i, k in enumerate(features.keys()): + # features[k] = NestedTensor(features[k], masks[i]) + # return features + # features: res2, res3, res4, res5 + features_returned = OrderedDict() + for l in self.return_interm_layers: + features_returned[l] = features[l] + + masks = self.mask_out_padding( + [ + features_per_level.shape + for features_per_level in features_returned.values() + ], + images.image_sizes, + device, + ) + assert len(features_returned) == len(masks) + + out_nested_features = OrderedDict() + for i, k in enumerate(self.return_interm_layers): + out_nested_features[k] = NestedTensor( + features_returned[k], masks[i]) + return out_nested_features + + def mask_out_padding(self, feature_shapes, image_sizes, device): + masks = [] + assert len(feature_shapes) == len(self.feature_strides) + for idx, shape in enumerate(feature_shapes): + N, _, H, W = shape + masks_per_feature_level = torch.ones( + (N, H, W), dtype=torch.bool, device=device) + for img_idx, (h, w) in enumerate(image_sizes): + # print('H', H, 'W', W, 'ceil: ', int(np.ceil(float(h) / self.feature_strides[idx])),) + masks_per_feature_level[ + img_idx, + : int(np.ceil(float(h) / self.feature_strides[idx])), + : int(np.ceil(float(w) / self.feature_strides[idx])), + ] = 0 + masks.append(masks_per_feature_level) + return masks + + +class DETR(nn.Module): + """ This is the DETR module that performs object detection """ + + def __init__(self, backbone, transformer, num_classes, num_queries, num_feature_levels, aux_loss=False, use_focal_loss=True,): + """ Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_classes: number of object classes + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For COCO, we recommend 100 queries. 
+ aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + """ + super().__init__() + self.num_queries = num_queries + self.transformer = transformer + hidden_dim = transformer.d_model + # self.class_embed = nn.Linear(hidden_dim, num_classes + 1) + self.class_embed = nn.Linear( + hidden_dim, num_classes if use_focal_loss else num_classes + 1 + ) + nn.init.constant_(self.class_embed.bias, bias_init_with_prob(0.01)) + self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + self.query_embed = nn.Embedding(num_queries, hidden_dim) + # self.input_proj = nn.Conv2d( + # backbone.num_channels, hidden_dim, kernel_size=1) + if num_feature_levels > 1: + num_backbone_outs = len(backbone[0].strides) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = backbone.num_channels[_] + if _ == 0: + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, + kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + )) + else: + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + )) + self.input_proj = nn.ModuleList(input_proj_list) + else: + # self.input_proj = nn.ModuleList([ + # nn.Sequential( + # nn.Conv2d( + # backbone.num_channels[0], hidden_dim, kernel_size=1), + # nn.GroupNorm(32, hidden_dim), + # )]) + self.input_proj = nn.Conv2d( + backbone.num_channels[-1], hidden_dim, kernel_size=1) + + self.backbone = backbone + self.aux_loss = aux_loss + + def forward(self, samples: NestedTensor): + """ The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x (num_classes + 1)] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, height, width). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. 
+ """ + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + features, pos = self.backbone(samples) + # print(samples) + # print(samples[1]) + # h_w = torch.stack([torch.stack([inst['size'] for inst in samples[1]])[:, 1], + # torch.stack([inst['size'] for inst in samples[1]])[:, 0]], dim=-1) + # h_w = h_w.unsqueeze(0) + h_w = torch.stack([torch.stack([torch.tensor(inst) for inst in samples.image_sizes])[:, 1], + torch.stack([torch.tensor(inst) for inst in samples.image_sizes])[:, 0]], dim=-1) + # print(h_w) + # h_w = torch.tensor(samples.image_sizes).to(device) + h_w = h_w.unsqueeze(0).to(device) + + src, mask = features[-1].decompose() + # print(f'{src.shape} {mask.shape}') + assert mask is not None + hs, points = self.transformer(self.input_proj( + src), mask, self.query_embed.weight, pos[-1], h_w) + num_decoder = hs.shape[0] + + outputs_class = self.class_embed(hs) + outputs_coord = self.bbox_embed(hs) + + points = points.unsqueeze(0).repeat(num_decoder, 1, 1, 1) + + outputs_coord[..., :2] = outputs_coord[..., :2] + points + outputs_coord = outputs_coord.sigmoid() + out = {'pred_logits': outputs_class[-1], + 'pred_boxes': outputs_coord[-1]} + if self.aux_loss: + out['aux_outputs'] = self._set_aux_loss( + outputs_class, outputs_coord) + return out + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_coord): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. + return [{'pred_logits': a, 'pred_boxes': b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + + +class SetCriterion(nn.Module): + """ This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """ Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + empty_weight = torch.ones(self.num_classes + 1) + empty_weight[-1] = self.eos_coef + self.register_buffer('empty_weight', empty_weight) + + def loss_labels(self, outputs, targets, indices, num_boxes, log=True): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert 'pred_logits' in outputs + src_logits = outputs['pred_logits'] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat([t["labels"][J] + for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full(src_logits.shape[:2], self.num_classes, + dtype=torch.int64, device=src_logits.device) + target_classes[idx] = target_classes_o + + src_logits_orig = src_logits + src_logits = src_logits.flatten(0, 1) + target_classes = target_classes.flatten(0, 1) + pos_inds = torch.nonzero( + target_classes != self.num_classes, as_tuple=True)[0] + labels = torch.zeros_like(src_logits) + labels[pos_inds, target_classes[pos_inds]] = 1 + loss_ce = sigmoid_focal_loss_with_mode( + src_logits, labels, num_boxes, mode="box") + losses = {'loss_ce': loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses['class_error'] = 100 - \ + accuracy(src_logits_orig[idx], target_classes_o)[0] + return losses + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs['pred_logits'] + device = pred_logits.device + tgt_lengths = torch.as_tensor( + [len(v["labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != + pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {'cardinality_error': card_err} + return losses + + def loss_boxes(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + assert 'pred_boxes' in outputs + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs['pred_boxes'][idx] + target_boxes = torch.cat([t['boxes'][i] + for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none') + + losses = {} + losses['loss_bbox'] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag(generalized_box_iou( + box_cxcywh_to_xyxy(src_boxes), + box_cxcywh_to_xyxy(target_boxes))) + losses['loss_giou'] = loss_giou.sum() / num_boxes + return losses + + def loss_masks(self, outputs, targets, indices, num_boxes): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], + mode="bilinear", align_corners=False) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss_with_mode(src_masks, target_masks, num_boxes), + "loss_dice": dice_loss(src_masks, target_masks, num_boxes), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(src, i) + for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(tgt, i) + for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + loss_map = { + 'labels': self.loss_labels, + 'cardinality': self.loss_cardinality, + 'boxes': self.loss_boxes, + 'masks': self.loss_masks + } + assert loss in loss_map, f'do you really want to compute {loss} loss?' + return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) + + def forward(self, outputs, targets): + """ This performs the loss computation. + Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, + v in outputs.items() if k != 'aux_outputs'} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_boxes = sum(len(t["labels"]) for t in targets) + num_boxes = torch.as_tensor( + [num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp( + num_boxes / comm.get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss( + loss, outputs, targets, indices, num_boxes)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if 'aux_outputs' in outputs: + for i, aux_outputs in enumerate(outputs['aux_outputs']): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + if loss == 'masks': + # Intermediate masks losses are too costly to compute, we ignore them. 
+ continue + kwargs = {} + if loss == 'labels': + # Logging is enabled only for the last layer + kwargs = {'log': False} + l_dict = self.get_loss( + loss, aux_outputs, targets, indices, num_boxes, **kwargs) + l_dict = {k + f'_{i}': v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + +class PostProcess(nn.Module): + """ This module converts the model's output into the format expected by the coco api""" + @torch.no_grad() + def forward(self, outputs, target_sizes): + """ Perform the computation + Parameters: + outputs: raw outputs of the model + target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch + For evaluation, this must be the original image size (before any data augmentation) + For visualization, this should be the image size after data augment, but before padding + """ + out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes'] + + assert len(out_logits) == len(target_sizes) + assert target_sizes.shape[1] == 2 + + prob = F.softmax(out_logits, -1) + scores, labels = prob[..., :-1].max(-1) + + # convert to [x0, y0, x1, y1] format + boxes = box_cxcywh_to_xyxy(out_bbox) + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{ + 'scores': s, + 'labels': l, + 'boxes': b + } for s, l, b in zip(scores, labels, boxes)] + + return results diff --git a/yolov7/modeling/meta_arch/solov2.py b/yolov7/modeling/meta_arch/solov2.py old mode 100755 new mode 100644 index e9c0fa2..5847120 --- a/yolov7/modeling/meta_arch/solov2.py +++ b/yolov7/modeling/meta_arch/solov2.py @@ -19,6 +19,7 @@ from fvcore.nn import sigmoid_focal_loss_jit from yolov7.utils.solov2_utils import imrescale, center_of_mass, point_nms, mask_nms, matrix_nms +from ..head.solov2_head import SOLOv2InsHead, SOLOv2MaskHead from ..loss.loss import dice_loss, FocalLoss from alfred.utils.log import logger from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy @@ -771,251 +772,3 @@ def inference_single_image_onnx( keep = keep.long() return seg_masks, cate_scores, cate_labels, keep - -class SOLOv2InsHead(nn.Module): - def __init__(self, cfg, input_shape: List[ShapeSpec]): - """ - SOLOv2 Instance Head. - """ - super().__init__() - # fmt: off - self.num_classes = cfg.MODEL.SOLOV2.NUM_CLASSES - self.num_kernels = cfg.MODEL.SOLOV2.NUM_KERNELS - self.num_grids = cfg.MODEL.SOLOV2.NUM_GRIDS - self.instance_in_features = cfg.MODEL.SOLOV2.INSTANCE_IN_FEATURES - self.instance_strides = cfg.MODEL.SOLOV2.FPN_INSTANCE_STRIDES - # = fpn. 
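PostProcess above keeps the highest-scoring real class per query (the softmax slice [..., :-1] drops the no-object column) and maps the normalized (cx, cy, w, h) boxes back to absolute pixel coordinates. A small sketch of that coordinate handling with toy numbers (values are not taken from the patch):

import torch

def cxcywh_to_xyxy(boxes):
    # same conversion the module performs before rescaling
    cx, cy, w, h = boxes.unbind(-1)
    return torch.stack([cx - 0.5 * w, cy - 0.5 * h,
                        cx + 0.5 * w, cy + 0.5 * h], dim=-1)

# one image, two predicted boxes in normalized (cx, cy, w, h)
out_bbox = torch.tensor([[[0.5, 0.5, 0.2, 0.4],
                          [0.1, 0.2, 0.1, 0.1]]])
target_sizes = torch.tensor([[480, 640]])     # (height, width) per image

boxes = cxcywh_to_xyxy(out_bbox)              # still in [0, 1]
img_h, img_w = target_sizes.unbind(1)
scale = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale[:, None, :]             # absolute xyxy in pixels
# first box -> [256., 144., 384., 336.]

Note that target_sizes is ordered (height, width), while the scale vector is assembled as [w, h, w, h] to line up with the xyxy layout.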
- self.instance_in_channels = cfg.MODEL.SOLOV2.INSTANCE_IN_CHANNELS - self.instance_channels = cfg.MODEL.SOLOV2.INSTANCE_CHANNELS - # Convolutions to use in the towers - self.type_dcn = cfg.MODEL.SOLOV2.TYPE_DCN - self.num_levels = len(self.instance_in_features) - assert self.num_levels == len(self.instance_strides), \ - print("Strides should match the features.") - # fmt: on - - head_configs = {"cate": (cfg.MODEL.SOLOV2.NUM_INSTANCE_CONVS, - cfg.MODEL.SOLOV2.USE_DCN_IN_INSTANCE, - False), - "kernel": (cfg.MODEL.SOLOV2.NUM_INSTANCE_CONVS, - cfg.MODEL.SOLOV2.USE_DCN_IN_INSTANCE, - cfg.MODEL.SOLOV2.USE_COORD_CONV) - } - - norm = None if cfg.MODEL.SOLOV2.NORM == "none" else cfg.MODEL.SOLOV2.NORM - in_channels = [s.channels for s in input_shape] - assert len(set(in_channels)) == 1, \ - print("Each level must have the same channel!") - in_channels = in_channels[0] - assert in_channels == cfg.MODEL.SOLOV2.INSTANCE_IN_CHANNELS, \ - print("In channels should equal to tower in channels!") - - for head in head_configs: - tower = [] - num_convs, use_deformable, use_coord = head_configs[head] - for i in range(num_convs): - conv_func = nn.Conv2d - if i == 0: - if use_coord: - chn = self.instance_in_channels + 2 - else: - chn = self.instance_in_channels - else: - chn = self.instance_channels - - tower.append(conv_func( - chn, self.instance_channels, - kernel_size=3, stride=1, - padding=1, bias=norm is None - )) - if norm == "GN": - tower.append(nn.GroupNorm(32, self.instance_channels)) - tower.append(nn.ReLU(inplace=True)) - self.add_module('{}_tower'.format(head), - nn.Sequential(*tower)) - - self.cate_pred = nn.Conv2d( - self.instance_channels, self.num_classes, - kernel_size=3, stride=1, padding=1 - ) - self.kernel_pred = nn.Conv2d( - self.instance_channels, self.num_kernels, - kernel_size=3, stride=1, padding=1 - ) - - for modules in [ - self.cate_tower, self.kernel_tower, - self.cate_pred, self.kernel_pred, - ]: - for l in modules.modules(): - if isinstance(l, nn.Conv2d): - torch.nn.init.normal_(l.weight, std=0.01) - if l.bias is not None: - nn.init.constant_(l.bias, 0) - - # initialize the bias for focal loss - prior_prob = cfg.MODEL.SOLOV2.PRIOR_PROB - bias_value = -math.log((1 - prior_prob) / prior_prob) - torch.nn.init.constant_(self.cate_pred.bias, bias_value) - - def forward(self, features): - """ - Arguments: - features (list[Tensor]): FPN feature map tensors in high to low resolution. - Each tensor in the list correspond to different feature levels. - - Returns: - pass - """ - cate_pred = [] - kernel_pred = [] - - for idx, feature in enumerate(features): - ins_kernel_feat = feature - # concat coord - x_range = torch.linspace(-1, 1, - ins_kernel_feat.shape[-1], device=ins_kernel_feat.device) - y_range = torch.linspace(-1, 1, - ins_kernel_feat.shape[-2], device=ins_kernel_feat.device) - y, x = torch.meshgrid(y_range, x_range) - y = y.expand([ins_kernel_feat.shape[0], 1, -1, -1]) - x = x.expand([ins_kernel_feat.shape[0], 1, -1, -1]) - coord_feat = torch.cat([x, y], 1) - ins_kernel_feat = torch.cat([ins_kernel_feat, coord_feat], 1) - - # individual feature. 
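The forward pass of the (now removed) SOLOv2InsHead relies on a CoordConv-style trick: two channels holding normalized x/y coordinates are concatenated onto the feature map before the kernel tower, and stripped again for the category branch via kernel_feat[:, :-2]. A self-contained sketch of that step, assuming nothing beyond plain PyTorch:

import torch

def add_coord_channels(feat):
    # feat: [N, C, H, W]; appends two channels with x and y in [-1, 1]
    # (illustrative sketch of the coord-concat step, not the head itself)
    n, _, h, w = feat.shape
    x_range = torch.linspace(-1, 1, w, device=feat.device)
    y_range = torch.linspace(-1, 1, h, device=feat.device)
    x = x_range.view(1, 1, 1, w).expand(n, 1, h, w)
    y = y_range.view(1, 1, h, 1).expand(n, 1, h, w)
    return torch.cat([feat, x, y], dim=1)

feat = torch.rand(2, 256, 32, 32)
print(add_coord_channels(feat).shape)   # torch.Size([2, 258, 32, 32])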
- kernel_feat = ins_kernel_feat - seg_num_grid = self.num_grids[idx] - kernel_feat = F.interpolate( - kernel_feat, size=seg_num_grid, mode='bilinear') - cate_feat = kernel_feat[:, :-2, :, :] - - # kernel - kernel_feat = self.kernel_tower(kernel_feat) - kernel_pred.append(self.kernel_pred(kernel_feat)) - - # cate - cate_feat = self.cate_tower(cate_feat) - cate_pred.append(self.cate_pred(cate_feat)) - return cate_pred, kernel_pred - - -class SOLOv2MaskHead(nn.Module): - def __init__(self, cfg, input_shape: List[ShapeSpec]): - """ - SOLOv2 Mask Head. - """ - super().__init__() - # fmt: off - self.mask_on = cfg.MODEL.MASK_ON - self.num_masks = cfg.MODEL.SOLOV2.NUM_MASKS - self.mask_in_features = cfg.MODEL.SOLOV2.MASK_IN_FEATURES - self.mask_in_channels = cfg.MODEL.SOLOV2.MASK_IN_CHANNELS - self.mask_channels = cfg.MODEL.SOLOV2.MASK_CHANNELS - self.num_levels = len(input_shape) - assert self.num_levels == len(self.mask_in_features), \ - print("Input shape should match the features.") - # fmt: on - norm = None if cfg.MODEL.SOLOV2.NORM == "none" else cfg.MODEL.SOLOV2.NORM - - self.convs_all_levels = nn.ModuleList() - for i in range(self.num_levels): - convs_per_level = nn.Sequential() - if i == 0: - conv_tower = list() - conv_tower.append(nn.Conv2d( - self.mask_in_channels, self.mask_channels, - kernel_size=3, stride=1, - padding=1, bias=norm is None - )) - if norm == "GN": - conv_tower.append(nn.GroupNorm(32, self.mask_channels)) - conv_tower.append(nn.ReLU(inplace=False)) - convs_per_level.add_module( - 'conv' + str(i), nn.Sequential(*conv_tower)) - self.convs_all_levels.append(convs_per_level) - continue - - for j in range(i): - if j == 0: - chn = self.mask_in_channels + 2 if i == 3 else self.mask_in_channels - conv_tower = list() - conv_tower.append(nn.Conv2d( - chn, self.mask_channels, - kernel_size=3, stride=1, - padding=1, bias=norm is None - )) - if norm == "GN": - conv_tower.append(nn.GroupNorm(32, self.mask_channels)) - conv_tower.append(nn.ReLU(inplace=False)) - convs_per_level.add_module( - 'conv' + str(j), nn.Sequential(*conv_tower)) - upsample_tower = nn.Upsample( - scale_factor=2, mode='bilinear', align_corners=False) - convs_per_level.add_module( - 'upsample' + str(j), upsample_tower) - continue - conv_tower = list() - conv_tower.append(nn.Conv2d( - self.mask_channels, self.mask_channels, - kernel_size=3, stride=1, - padding=1, bias=norm is None - )) - if norm == "GN": - conv_tower.append(nn.GroupNorm(32, self.mask_channels)) - conv_tower.append(nn.ReLU(inplace=False)) - convs_per_level.add_module( - 'conv' + str(j), nn.Sequential(*conv_tower)) - upsample_tower = nn.Upsample( - scale_factor=2, mode='bilinear', align_corners=False) - convs_per_level.add_module('upsample' + str(j), upsample_tower) - - self.convs_all_levels.append(convs_per_level) - - self.conv_pred = nn.Sequential( - nn.Conv2d( - self.mask_channels, self.num_masks, - kernel_size=1, stride=1, - padding=0, bias=norm is None), - nn.GroupNorm(32, self.num_masks), - nn.ReLU(inplace=True) - ) - - for modules in [self.convs_all_levels, self.conv_pred]: - for l in modules.modules(): - if isinstance(l, nn.Conv2d): - torch.nn.init.normal_(l.weight, std=0.01) - if l.bias is not None: - nn.init.constant_(l.bias, 0) - - def forward(self, features): - """ - Arguments: - features (list[Tensor]): FPN feature map tensors in high to low resolution. - Each tensor in the list correspond to different feature levels. 
- - Returns: - pass - """ - assert len(features) == self.num_levels, \ - print("The number of input features should be equal to the supposed level.") - - # bottom features first. - feature_add_all_level = self.convs_all_levels[0](features[0]) - for i in range(1, self.num_levels): - mask_feat = features[i] - if i == 3: # add for coord. - x_range = torch.linspace(-1, 1, - mask_feat.shape[-1], device=mask_feat.device) - y_range = torch.linspace(-1, 1, - mask_feat.shape[-2], device=mask_feat.device) - y, x = torch.meshgrid(y_range, x_range) - y = y.expand([mask_feat.shape[0], 1, -1, -1]) - x = x.expand([mask_feat.shape[0], 1, -1, -1]) - coord_feat = torch.cat([x, y], 1) - mask_feat = torch.cat([mask_feat, coord_feat], 1) - # add for top features. - feature_add_all_level += self.convs_all_levels[i](mask_feat) - - mask_pred = self.conv_pred(feature_add_all_level) - return mask_pred diff --git a/yolov7/modeling/meta_arch/sparseinst.py b/yolov7/modeling/meta_arch/sparseinst.py new file mode 100644 index 0000000..8e06223 --- /dev/null +++ b/yolov7/modeling/meta_arch/sparseinst.py @@ -0,0 +1,342 @@ +# Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from detectron2.modeling import build_backbone +from detectron2.structures import ImageList, Instances, BitMasks +from detectron2.modeling import META_ARCH_REGISTRY, build_backbone + +from yolov7.modeling.transcoders.encoder_sparseinst import build_sparse_inst_encoder +from yolov7.modeling.transcoders.decoder_sparseinst import build_sparse_inst_decoder + +from ..loss.sparseinst_loss import build_sparse_inst_criterion + +# from .utils import nested_tensor_from_tensor_list +from yolov7.utils.misc import nested_tensor_from_tensor_list +from alfred.utils.log import logger +from alfred import print_shape + +__all__ = ["SparseInst"] + + +@torch.jit.script +def rescoring_mask(scores, mask_pred, masks): + mask_pred_ = mask_pred.float() + return scores * ((masks * mask_pred_).sum([1, 2]) / (mask_pred_.sum([1, 2]) + 1e-6)) + + +def rescoring_mask_batch(scores, mask_pred, masks): + # scores and masks contains batch + print(f'mask_pred: {mask_pred.shape}, masks: {masks.shape}, scores: {scores.shape}') + mask_pred_ = mask_pred.float() + + # # masks = (masks * mask_pred_).sum([2, 3]) + # mask_pred2 = torch.sum(mask_pred_, [2, 3]) + # mask_pred2 = mask_pred2 + 1e-6 + # # masks_to_m = masks / mask_pred + # # print(masks_to_m.shape, scores.shape) + # # return scores * masks_to_m + # scores *= mask_pred2 + # return scores + + return scores * ((masks * mask_pred_).sum([2, 3]) / (mask_pred_.sum([2, 3]) + 1e-6)) + +def batched_index_select(input, dim, index): + views = [1 if i != dim else -1 for i in range(len(input.shape))] + expanse = list(input.shape) + expanse[dim] = -1 + index = index.view(views).expand(expanse) + # making the first dim of output be B + return torch.cat(torch.chunk(torch.gather(input, dim, index), chunks=index.shape[0], dim=dim), dim=0) + +@META_ARCH_REGISTRY.register() +class SparseInst(nn.Module): + def __init__(self, cfg): + super().__init__() + + # move to target device + self.device = torch.device(cfg.MODEL.DEVICE) + + # backbone + self.backbone = build_backbone(cfg) + self.size_divisibility = self.backbone.size_divisibility + output_shape = self.backbone.output_shape() + + # encoder & decoder + self.encoder = build_sparse_inst_encoder(cfg, output_shape) + self.decoder = build_sparse_inst_decoder(cfg) + + # matcher & loss (matcher is built in loss) + 
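rescoring_mask in the new sparseinst.py scales each class score by its "maskness", i.e. the mean soft-mask value inside the thresholded binary mask, so detections with diffuse masks are down-weighted. A standalone sketch of the same computation (the threshold below is a made-up value, not cfg.MODEL.SPARSE_INST.MASK_THRESHOLD):

import torch

def rescoring_by_maskness(scores, mask_logits, threshold=0.45):
    # scores: [N], mask_logits: [N, H, W] sigmoid outputs
    # illustrative re-implementation of the rescoring idea; inputs are toy data
    hard = (mask_logits > threshold).float()
    maskness = (mask_logits * hard).sum(dim=(1, 2)) / (hard.sum(dim=(1, 2)) + 1e-6)
    return scores * maskness

scores = torch.tensor([0.9, 0.8])
masks = torch.rand(2, 8, 8)
print(rescoring_by_maskness(scores, masks))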
self.criterion = build_sparse_inst_criterion(cfg) + + # data and preprocessing + self.mask_format = cfg.INPUT.MASK_FORMAT + + self.pixel_mean = ( + torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) + ) + self.pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) + # only for onnx export + self.normalizer_trans = lambda x: (x - self.pixel_mean.unsqueeze(0)) / self.pixel_std.unsqueeze(0) + + # inference + # self.cls_threshold = cfg.MODEL.SPARSE_INST.CLS_THRESHOLD + self.cls_threshold = cfg.MODEL.YOLO.CONF_THRESHOLD + self.mask_threshold = cfg.MODEL.SPARSE_INST.MASK_THRESHOLD + self.max_detections = cfg.MODEL.SPARSE_INST.MAX_DETECTIONS + + def normalizer(self, image): + image = (image - self.pixel_mean) / self.pixel_std + return image + + def preprocess_inputs(self, batched_inputs): + images = [x["image"].to(self.device) for x in batched_inputs] + images = [self.normalizer(x) for x in images] + images = ImageList.from_tensors(images, 32) + return images + + def prepare_targets(self, targets): + new_targets = [] + for targets_per_image in targets: + target = {} + gt_classes = targets_per_image.gt_classes + target["labels"] = gt_classes.to(self.device) + h, w = targets_per_image.image_size + if not targets_per_image.has("gt_masks"): + gt_masks = BitMasks(torch.empty(0, h, w)) + else: + gt_masks = targets_per_image.gt_masks + if self.mask_format == "polygon": + if len(gt_masks.polygons) == 0: + gt_masks = BitMasks(torch.empty(0, h, w)) + else: + gt_masks = BitMasks.from_polygon_masks(gt_masks.polygons, h, w) + + target["masks"] = gt_masks.to(self.device) + new_targets.append(target) + + return new_targets + + def preprocess_inputs_onnx(self, x): + x = x.permute(0, 3, 1, 2) + x = self.normalizer_trans(x) + return x + + def forward(self, batched_inputs): + if torch.onnx.is_in_onnx_export(): + logger.info("[WARN] exporting onnx...") + assert isinstance(batched_inputs, (list, torch.Tensor)) or isinstance( + batched_inputs, list + ), "onnx export, batched_inputs only needs image tensor or list of tensors" + images = self.preprocess_inputs_onnx(batched_inputs) + logger.info(f'images onnx input: {images.shape}') + else: + images = self.preprocess_inputs(batched_inputs) + + # if isinstance(images, (list, torch.Tensor)): + # images = nested_tensor_from_tensor_list(images) + + if isinstance(images, ImageList): + max_shape = images.tensor.shape[2:] + features = self.backbone(images.tensor) + else: + # onnx trace + max_shape = images.shape[2:] + features = self.backbone(images) + + features = self.encoder(features) + output = self.decoder(features) + + if self.training: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + targets = self.prepare_targets(gt_instances) + losses = self.criterion(output, targets, max_shape) + return losses + else: + if torch.onnx.is_in_onnx_export(): + results = self.inference_onnx( + output, batched_inputs, max_shape + ) + return results + else: + results = self.inference( + output, batched_inputs, max_shape, images.image_sizes + ) + processed_results = [{"instances": r} for r in results] + return processed_results + + def forward_test(self, images): + pass + + def inference(self, output, batched_inputs, max_shape, image_sizes): + # max_detections = self.max_detections + results = [] + pred_scores = output["pred_logits"].sigmoid() + pred_masks = output["pred_masks"].sigmoid() + pred_objectness = output["pred_scores"].sigmoid() + pred_scores = torch.sqrt(pred_scores * pred_objectness) + + for _, ( + scores_per_image, + 
mask_pred_per_image, + batched_input, + img_shape, + ) in enumerate(zip(pred_scores, pred_masks, batched_inputs, image_sizes)): + + ori_shape = (batched_input["height"], batched_input["width"]) + result = Instances(ori_shape) + # max/argmax + scores, labels = scores_per_image.max(dim=-1) + # cls threshold + keep = scores > self.cls_threshold + scores = scores[keep] + labels = labels[keep] + mask_pred_per_image = mask_pred_per_image[keep] + + if scores.size(0) == 0: + result.scores = scores + result.pred_classes = labels + results.append(result) + continue + + h, w = img_shape + # rescoring mask using maskness + scores = rescoring_mask( + scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image + ) + + # upsample the masks to the original resolution: + # (1) upsampling the masks to the padded inputs, remove the padding area + # (2) upsampling/downsampling the masks to the original sizes + mask_pred_per_image = F.interpolate( + mask_pred_per_image.unsqueeze(1), + size=max_shape, + mode="bilinear", + align_corners=False, + )[:, :, :h, :w] + mask_pred_per_image = F.interpolate( + mask_pred_per_image, + size=ori_shape, + mode="bilinear", + align_corners=False, + ).squeeze(1) + + mask_pred = mask_pred_per_image > self.mask_threshold + # mask_pred = BitMasks(mask_pred) + + # using Detectron2 Instances to store the final results + result.pred_masks = mask_pred + result.scores = scores + result.pred_classes = labels + results.append(result) + return results + + def inference_onnx(self, output, batched_inputs, max_shape): + from alfred import print_shape + # max_detections = self.max_detections + pred_scores = output["pred_logits"].sigmoid() + pred_masks = output["pred_masks"].sigmoid() + pred_objectness = output["pred_scores"].sigmoid() + # solve Nan problems with a minimal epsilon + pred_scores = torch.sqrt(pred_scores * pred_objectness + 1e-3) + print(f'pred_scores: {pred_scores.shape}, perd_masks: {pred_masks.shape}') + + # all_scores = [] + # all_labels = [] + # all_masks = [] + # print('max_shape: ', max_shape) + + # for _, ( + # scores_per_image, + # mask_pred_per_image, + # batched_input, + # ) in enumerate(zip(pred_scores, pred_masks, batched_inputs)): + + # # max/argmax + # scores, labels = torch.max(scores_per_image, dim=scores_per_image.dim()-1) + # # cls threshold + # # keep = scores > self.cls_threshold + # _, keep = torch.topk(scores, k=50) + # print(keep.shape, scores.shape) + # scores = scores[keep] + # labels = labels[keep] + # # print(scores, labels) + # mask_pred_per_image = mask_pred_per_image[keep] + + # h, w = max_shape + # # rescoring mask using maskness + # scores = rescoring_mask( + # scores, mask_pred_per_image > self.mask_threshold, mask_pred_per_image + # ) + + # # upsample the masks to the original resolution: + # # (1) upsampling the masks to the padded inputs, remove the padding area + # # (2) upsampling/downsampling the masks to the original sizes + # print('mask_pred_per_image: ', mask_pred_per_image.shape) + # mask_pred_per_image = F.interpolate( + # mask_pred_per_image.unsqueeze(1), + # size=max_shape, + # mode="bilinear", + # align_corners=False, + # )[:, :h, :w] + + # mask_pred = mask_pred_per_image > self.mask_threshold + + # all_masks.append(mask_pred) + # all_scores.append(scores) + # all_labels.append(labels) + + # do it in batch + # max/argmax + scores, labels = torch.max(pred_scores, dim=pred_scores.dim()-1) + K_ = min(50, self.max_detections) + _, keep = torch.topk(scores, k=K_) + print(keep.shape, scores.shape) + keep_flt = keep.view(-1, 
K_) + scores = scores.view(-1) + labels = labels.view(-1) + scores = scores[keep_flt] + labels = labels[keep_flt] + + # advanced select + pred_masks = pred_masks.view(-1, pred_masks.shape[-2], pred_masks.shape[-1]) + print_shape(keep_flt, pred_masks) + mask_pred_batch = pred_masks[keep_flt] # 1, 100, 160, 160 + + h, w = max_shape + # rescoring mask using maskness + scores = rescoring_mask_batch( + scores, mask_pred_batch > self.mask_threshold, mask_pred_batch + ) + + + + # upsample the masks to the original resolution: + # (1) upsampling the masks to the padded inputs, remove the padding area + # (2) upsampling/downsampling the masks to the original sizes + print('mask_pred_per_image: ', mask_pred_batch.shape) + mask_pred_batch = F.interpolate( + mask_pred_batch, + size=max_shape, + mode="bilinear", + align_corners=False, + )[:, :h, :w] + mask_pred = mask_pred_batch > self.mask_threshold + + + # do scores here + # masks_values = mask_pred.float() * mask_pred_batch + # masks_values = torch.sum(masks_values, [-2, -1]) + # scores = scores * masks_values + # print(scores, labels) + + # all_masks = torch.stack(all_masks).to(torch.long) + # all_scores = torch.stack(all_scores) + # all_labels = torch.stack(all_labels) + # # logger.info(f'all_scores: {all_scores.shape}') + # # logger.info(f'all_labels: {all_labels.shape}') + # logger.info(f'all_masks: {all_masks.shape}') + # return all_masks, all_scores, all_labels + return mask_pred, scores, labels diff --git a/yolov7/modeling/meta_arch/utils.py b/yolov7/modeling/meta_arch/utils.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/meta_arch/vidtplus.py b/yolov7/modeling/meta_arch/vidtplus.py new file mode 100644 index 0000000..c6b5c44 --- /dev/null +++ b/yolov7/modeling/meta_arch/vidtplus.py @@ -0,0 +1,5 @@ +''' +https://github.com/naver-ai/vidt/tree/vidt-plus + +swin-nano with mAP 45, while about 20FPS +''' diff --git a/yolov7/modeling/meta_arch/yolo.py b/yolov7/modeling/meta_arch/yolo.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/meta_arch/yolof.py b/yolov7/modeling/meta_arch/yolof.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/meta_arch/yolomask.py b/yolov7/modeling/meta_arch/yolomask.py old mode 100755 new mode 100644 index a28e477..917b3e1 --- a/yolov7/modeling/meta_arch/yolomask.py +++ b/yolov7/modeling/meta_arch/yolomask.py @@ -33,8 +33,6 @@ from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy from alfred.vis.image.mask import label2color_mask, vis_bitmasks -from nb.torch.blocks.head_blocks import SPP, PANet - from ..neck.yolo_pafpn import YOLOPAFPN from yolov7.utils.boxes import postprocess, bbox_ious2, BoxModeMy, postprocessv5, anchor_ious, postprocess_yolomask from ..backbone.layers.wrappers import BaseConv, NearestUpsample, ConvBNRelu diff --git a/yolov7/modeling/meta_arch/yolov5.py b/yolov7/modeling/meta_arch/yolov5.py old mode 100755 new mode 100644 index 18d82a2..a28ec3f --- a/yolov7/modeling/meta_arch/yolov5.py +++ b/yolov7/modeling/meta_arch/yolov5.py @@ -34,8 +34,6 @@ from alfred.dl.torch.common import print_tensor, device from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy -from nb.torch.blocks.head_blocks import SPP, PANet - from ..neck.yolo_pafpn import YOLOPAFPN from yolov7.utils.boxes import postprocess, bboxes_iou, BoxModeMy, postprocessv5 import time diff --git a/yolov7/modeling/meta_arch/yolov6.py b/yolov7/modeling/meta_arch/yolov6.py new file mode 100644 index 0000000..0e6d22a --- /dev/null +++ 
b/yolov7/modeling/meta_arch/yolov6.py @@ -0,0 +1,292 @@ +import torch.nn as nn +import torch +from detectron2.modeling import META_ARCH_REGISTRY +from detectron2.modeling.backbone import build_backbone + +from detectron2.structures import Boxes, ImageList, Instances, image_list +from detectron2.utils import comm +from detectron2.utils.logger import log_first_n +from detectron2.modeling.postprocessing import detector_postprocess + +import torch.distributed as dist + +import numpy as np +import time +import logging +from alfred.utils.log import logger + +from ..head.yolox_head import YOLOXHead +from ..head.yolov6_head import YOLOv6Head +from ..neck.yolo_pafpn import YOLOPAFPN +from ..neck.reppan import RepPANNeck + +from yolov7.utils.boxes import postprocess, BoxModeMy + + +""" +Implementation of YOLOv6 + +""" + + +@META_ARCH_REGISTRY.register() +class YOLOV6(nn.Module): + def __init__(self, cfg): + super(YOLOV6, self).__init__() + # configurations + self.device = torch.device(cfg.MODEL.DEVICE) + self.conf_threshold = cfg.MODEL.YOLO.CONF_THRESHOLD + self.nms_threshold = cfg.MODEL.YOLO.NMS_THRESHOLD + self.nms_type = cfg.MODEL.NMS_TYPE + self.loss_type = cfg.MODEL.YOLO.LOSS_TYPE + self.head_type = cfg.MODEL.YOLO.HEAD.TYPE + + # l1 loss will open at last 15 epochs + self.use_l1 = False + + self.depth_mul = cfg.MODEL.YOLO.DEPTH_MUL + self.width_mul = cfg.MODEL.YOLO.WIDTH_MUL + + self.iter = 0 + self.max_iter = cfg.SOLVER.MAX_ITER + self.enable_l1_loss_at = cfg.INPUT.MOSAIC_AND_MIXUP.DISABLE_AT_ITER + self.num_classes = cfg.MODEL.YOLO.CLASSES + self.max_boxes_num = cfg.MODEL.YOLO.MAX_BOXES_NUM + self.in_features = cfg.MODEL.YOLO.IN_FEATURES + self.neck_type = cfg.MODEL.YOLO.NECK.TYPE + + self.backbone = build_backbone(cfg) + backbone_shape = self.backbone.output_shape() + self.size_divisibility = ( + 32 + if self.backbone.size_divisibility == 0 + else self.backbone.size_divisibility + ) + backbone_shape = [backbone_shape[i].channels for i in self.in_features] + logger.info( + "backboneshape: {}, size_divisibility: {}".format( + backbone_shape, self.size_divisibility + ) + ) + + # don't specific in_channels, let it calculate + + if self.neck_type == "reppan": + self.neck = RepPANNeck( + channels_list=self.backbone.channels_list, + num_repeats=self.backbone.num_repeats, + in_features=self.in_features, + ) + logger.warning("Using YOLOv6 RepPAN neck!") + else: + self.neck = YOLOPAFPN( + depth=self.depth_mul, width=self.width_mul, in_features=self.in_features + ) + + if self.head_type == "yolov6": + self.head = YOLOv6Head( + self.num_classes, channels_list=self.backbone.channels_list + ) + else: + self.head = YOLOXHead( + self.num_classes, width=self.width_mul, in_channels=backbone_shape + ) + + pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) + pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) + self.padded_value = cfg.MODEL.PADDED_VALUE + self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std + self.to(self.device) + self.onnx_export = False + self.onnx_vis = False + + self.apply(self._init_model) + self.head.initialize_biases(1e-2) + + @staticmethod + def _init_model(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + def update_iter(self, i): + self.iter = i + + def preprocess_image(self, batched_inputs, training): + images = [x["image"].to(self.device) for x in batched_inputs] + bs = len(images) + # images = [self.normalizer(x) for x in images] + images = [x.type(torch.float) for x 
in images] + + images = ImageList.from_tensors( + images, + size_divisibility=self.size_divisibility, + pad_value=self.padded_value, + ) + # logger.info('images ori shape: {}'.format(images.tensor.shape)) + + if training and self.iter > self.enable_l1_loss_at and not self.use_l1: + meg = torch.BoolTensor(1).to(self.device) + if comm.is_main_process(): + logger.info("[master] enable l1 loss now at iter: {}".format(self.iter)) + # enable l1 loss at last 50000 iterations + meg.fill_(True) + + if comm.get_world_size() > 1: + comm.synchronize() + if comm.is_main_process(): + dist.broadcast(meg, 0) + self.head.use_l1 = meg.item() + self.use_l1 = meg.item() + comm.synchronize() + logger.info("check head l1: {}".format(self.head.use_l1)) + + if training: + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + elif "targets" in batched_inputs[0]: + log_first_n( + logging.WARN, + "'targets' in the model inputs is now renamed to 'instances'!", + n=10, + ) + gt_instances = [x["targets"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + + if gt_instances: + for i in gt_instances: + i.gt_boxes.tensor = BoxModeMy.convert( + i.gt_boxes.tensor, + from_mode=BoxModeMy.XYXY_ABS, + to_mode=BoxModeMy.XYWH_ABS, + ) + + targets = [ + torch.cat( + # YOLOX using [cls, box], box is cx cy w h + [ + instance.gt_classes.float().unsqueeze(-1), + instance.gt_boxes.tensor, + ], + dim=-1 + # [instance.gt_boxes.tensor, instance.gt_classes.float().unsqueeze(-1), ], dim=-1 + ) + for instance in gt_instances + ] + + labels = torch.zeros((bs, self.max_boxes_num, 5)) + # first dim assign -1 for none-classes + labels[:, :, 0] = -1 + for i, target in enumerate(targets): + if target.shape[0] > self.max_boxes_num: + target = target[: self.max_boxes_num, :] + labels[i][: target.shape[0]] = target + else: + labels = None + + # self.iter += 1 + return images, labels, images.image_sizes + + def preprocess_input(self, x): + x = x.permute(0, 3, 1, 2) + # x = F.interpolate(x, size=(640, 640)) + # x = F.interpolate(x, size=(512, 960)) + # x = self.normalizer(x) + return x + + def forward(self, batched_inputs): + if self.onnx_export: + logger.info("[WARN] exporting onnx...") + assert isinstance(batched_inputs, torch.Tensor) or isinstance( + batched_inputs, list + ), "onnx export, batched_inputs only needs image tensor" + x = self.preprocess_input(batched_inputs) + # batched_inputs = batched_inputs.permute(0, 3, 1, 2) + image_ori_sizes = [batched_inputs.shape[1:3]] + else: + images, labels, image_ori_sizes = self.preprocess_image( + batched_inputs, self.training + ) + if labels is not None: + labels = labels.to(images.device) + + x = images.tensor + img_size = x.shape[-2:] + # logger.info('img size: {}'.format(img_size)) + + if self.eval: + t0 = time.time() + + out_features = self.backbone(x) + # for k, v in out_features.items(): + # print(k, v.shape) + fpn_outs = self.neck(out_features) # 512, 1024, 2048, s, m, l + # for i in fpn_outs: + # print(i.shape) + + if self.training: + # print(labels) + loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head( + fpn_outs, labels, x + ) + + outputs = { + "total_loss": loss, + "iou_loss": iou_loss, + "conf_loss": conf_loss, + "cls_loss": cls_loss, + } + if self.use_l1: + outputs["l1_loss"] = l1_loss + return outputs + else: + if self.onnx_export: + if not self.onnx_vis: + # self.head.decode_in_inference = False + self.head.decode_in_inference = True + self.head.onnx_export = True + # we wrap box decode into onnx 
model as well + outputs = self.head(fpn_outs) + return outputs + else: + self.head.decode_in_inference = True + outputs = self.head(fpn_outs) + detections = postprocess( + outputs, + self.num_classes, + self.conf_threshold, + self.nms_threshold, + ) + return detections + else: + outputs = self.head(fpn_outs) + + t1 = time.time() + + detections = postprocess( + outputs, self.num_classes, self.conf_threshold, self.nms_threshold + ) + + results = [] + for idx, out in enumerate(detections): + if out is None: + out = x.new_zeros((0, 7)) + image_size = image_ori_sizes[idx] + result = Instances(image_size) + result.pred_boxes = Boxes(out[:, :4]) + result.scores = out[:, 5] * out[:, 4] + result.pred_classes = out[:, -1] + results.append(result) + + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + results, batched_inputs, images.image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + # return processed_results, t1 - t0 + return processed_results diff --git a/yolov7/modeling/meta_arch/yolov7.py b/yolov7/modeling/meta_arch/yolov7.py old mode 100755 new mode 100644 index 57f5337..694a8fe --- a/yolov7/modeling/meta_arch/yolov7.py +++ b/yolov7/modeling/meta_arch/yolov7.py @@ -1,4 +1,3 @@ - #!/usr/bin/env python3 # -*- coding: utf-8 -*- # Copyright (c) BaseDetection, Inc. and its affiliates. @@ -17,7 +16,12 @@ import torch.nn.functional as F from detectron2.modeling.meta_arch import build from detectron2.layers import ShapeSpec -from detectron2.modeling import BACKBONE_REGISTRY, ResNet, ResNetBlockBase, META_ARCH_REGISTRY +from detectron2.modeling import ( + BACKBONE_REGISTRY, + ResNet, + ResNetBlockBase, + META_ARCH_REGISTRY, +) from detectron2.modeling.postprocessing import detector_postprocess from detectron2.structures import Boxes, ImageList, Instances, boxes, image_list from detectron2.utils import comm @@ -31,7 +35,6 @@ from .utils import generalized_batched_nms from yolov7.utils.boxes import postprocess, bboxes_iou -from nb.torch.blocks.head_blocks import SPP, PANet from alfred.vis.image.det import visualize_det_cv2_part, visualize_det_cv2_fancy from yolov7.modeling.neck.yolo_fpn import YOLOFPN @@ -39,8 +42,15 @@ __all__ = ["YOLOV7", "YOLOHead"] -supported_backbones = ['resnet', 'res2net', 'regnet', - 'swin', 'efficient', 'darknet', 'pvt'] +supported_backbones = [ + "resnet", + "res2net", + "regnet", + "swin", + "efficient", + "darknet", + "pvt", +] @META_ARCH_REGISTRY.register() @@ -71,17 +81,22 @@ def __init__(self, cfg): self.change_iter = 10 self.iter = 0 - assert len([i for i in supported_backbones if i in cfg.MODEL.BACKBONE.NAME] - ) > 0, 'Only {} supported.'.format(supported_backbones) + assert ( + len([i for i in supported_backbones if i in cfg.MODEL.BACKBONE.NAME]) > 0 + ), "Only {} supported.".format(supported_backbones) self.backbone = build_backbone(cfg) backbone_shape = self.backbone.output_shape() - self.size_divisibility = 32 if self.backbone.size_divisibility == 0 else self.backbone.size_divisibility + self.size_divisibility = ( + 32 + if self.backbone.size_divisibility == 0 + else self.backbone.size_divisibility + ) backbone_shape = [backbone_shape[i].channels for i in self.in_features] if comm.is_main_process(): - logger.info('YOLO.ANCHORS: {}'.format(cfg.MODEL.YOLO.ANCHORS)) - logger.info('backboneshape: {}'.format(backbone_shape)) + logger.info("YOLO.ANCHORS: 
{}".format(cfg.MODEL.YOLO.ANCHORS)) + logger.info("backboneshape: {}".format(backbone_shape)) # todo: wrap this to neck, support SPP , DarkNeck, PAN @@ -107,56 +122,100 @@ def __init__(self, cfg): # self.out2 = self._make_embedding( # [128, 256], backbone_shape[-3] + 128, out_filter_2) - if self.neck_type == 'fpn': - self.neck = YOLOFPN(width=self.width_mul, in_channels=backbone_shape, - in_features=self.in_features, with_spp=self.with_spp) + if self.neck_type == "fpn": + self.neck = YOLOFPN( + width=self.width_mul, + in_channels=backbone_shape, + in_features=self.in_features, + with_spp=self.with_spp, + ) # 256, 512, 1024 -> 1024, 512, 256 - self.m = nn.ModuleList(nn.Conv2d(x, len( - cfg.MODEL.YOLO.ANCHORS[0]) * (5 + cfg.MODEL.YOLO.CLASSES), 1) for x in self.neck.out_channels) - elif self.neck_type == 'pafpn': + self.m = nn.ModuleList( + nn.Conv2d( + x, len(cfg.MODEL.YOLO.ANCHORS[0]) * (5 + cfg.MODEL.YOLO.CLASSES), 1 + ) + for x in self.neck.out_channels + ) + elif self.neck_type == "pafpn": width_mul = backbone_shape[0] / 256 self.neck = YOLOPAFPN( - depth=self.depth_mul, width=width_mul, in_features=self.in_features) - self.m = nn.ModuleList(nn.Conv2d(x, len( - cfg.MODEL.YOLO.ANCHORS[0]) * (5 + cfg.MODEL.YOLO.CLASSES), 1) for x in backbone_shape) + depth=self.depth_mul, width=width_mul, in_features=self.in_features + ) + self.m = nn.ModuleList( + nn.Conv2d( + x, len(cfg.MODEL.YOLO.ANCHORS[0]) * (5 + cfg.MODEL.YOLO.CLASSES), 1 + ) + for x in backbone_shape + ) else: - logger.error( - '{} neck not supported for now.'.format(self.neck_type)) - - pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view( - 3, 1, 1) - pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view( - 3, 1, 1) - self.normalizer = lambda x: (x / 255. - pixel_mean) / pixel_std + logger.info(f"type: {self.neck_type} not valid, using default FPN neck.") + self.neck = YOLOFPN( + width=self.width_mul, + in_channels=backbone_shape, + in_features=self.in_features, + with_spp=self.with_spp, + ) + # 256, 512, 1024 -> 1024, 512, 256 + self.m = nn.ModuleList( + nn.Conv2d( + x, len(cfg.MODEL.YOLO.ANCHORS[0]) * (5 + cfg.MODEL.YOLO.CLASSES), 1 + ) + for x in self.neck.out_channels + ) + + pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1) + pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1) + self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std self.padded_value = cfg.MODEL.PADDED_VALUE self.loss_evaluators = [ - YOLOHead(cfg, anchor, level) for level, anchor in enumerate(cfg.MODEL.YOLO.ANCHORS)] + YOLOHead(cfg, anchor, level) + for level, anchor in enumerate(cfg.MODEL.YOLO.ANCHORS) + ] self.to(self.device) def update_iter(self, i): self.iter = i def _make_cbl(self, _in, _out, ks): - ''' cbl = conv + batch_norm + leaky_relu - ''' + """cbl = conv + batch_norm + leaky_relu""" pad = (ks - 1) // 2 if ks else 0 - return nn.Sequential(OrderedDict([ - ("conv", nn.Conv2d(_in, _out, kernel_size=ks, - stride=1, padding=pad, bias=False)), - ("bn", nn.BatchNorm2d(_out)), - ("relu", nn.LeakyReLU(0.1)), - ])) + return nn.Sequential( + OrderedDict( + [ + ( + "conv", + nn.Conv2d( + _in, _out, kernel_size=ks, stride=1, padding=pad, bias=False + ), + ), + ("bn", nn.BatchNorm2d(_out)), + ("relu", nn.LeakyReLU(0.1)), + ] + ) + ) def _make_embedding(self, filters_list, in_filters, out_filter): - m = nn.ModuleList([ - self._make_cbl(in_filters, filters_list[0], 1), - self._make_cbl(filters_list[0], filters_list[1], 3), - self._make_cbl(filters_list[1], filters_list[0], 
1), - self._make_cbl(filters_list[0], filters_list[1], 3), - self._make_cbl(filters_list[1], filters_list[0], 1), - self._make_cbl(filters_list[0], filters_list[1], 3)]) - m.add_module("conv_out", nn.Conv2d(filters_list[1], out_filter, kernel_size=1, - stride=1, padding=0, bias=True)) + m = nn.ModuleList( + [ + self._make_cbl(in_filters, filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + self._make_cbl(filters_list[1], filters_list[0], 1), + self._make_cbl(filters_list[0], filters_list[1], 3), + ] + ) + m.add_module( + "conv_out", + nn.Conv2d( + filters_list[1], + out_filter, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ), + ) return m def preprocess_image(self, batched_inputs, training): @@ -173,7 +232,10 @@ def preprocess_image(self, batched_inputs, training): images = [self.normalizer(x) for x in images] images = ImageList.from_tensors( - images, size_divisibility=self.size_divisibility, pad_value=self.padded_value/255.) + images, + size_divisibility=self.size_divisibility, + pad_value=self.padded_value / 255.0, + ) # logger.info('images ori shape: {}'.format(images.tensor.shape)) # logger.info('images ori shape: {}'.format(images.image_sizes)) @@ -183,8 +245,7 @@ def preprocess_image(self, batched_inputs, training): meg = torch.BoolTensor(1).to(self.device) comm.synchronize() if comm.is_main_process(): - logger.info( - '[master] enable l1 loss now at iter: {}'.format(self.iter)) + logger.info("[master] enable l1 loss now at iter: {}".format(self.iter)) # enable l1 loss at last 50000 iterations meg.fill_(True) @@ -197,23 +258,24 @@ def preprocess_image(self, batched_inputs, training): if training: if "instances" in batched_inputs[0]: - gt_instances = [ - x["instances"].to(self.device) for x in batched_inputs - ] + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] elif "targets" in batched_inputs[0]: log_first_n( logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", - n=10) - gt_instances = [ - x["targets"].to(self.device) for x in batched_inputs - ] + n=10, + ) + gt_instances = [x["targets"].to(self.device) for x in batched_inputs] else: gt_instances = None targets = [ torch.cat( - [instance.gt_classes.float().unsqueeze(-1), instance.gt_boxes.tensor], dim=-1 + [ + instance.gt_classes.float().unsqueeze(-1), + instance.gt_boxes.tensor, + ], + dim=-1, ) for instance in gt_instances ] @@ -221,8 +283,8 @@ def preprocess_image(self, batched_inputs, training): # todo: what if targets more than max_boxes_num? 
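The TODO above asks what happens when an image carries more ground-truth boxes than max_boxes_num: in this patch the label tensor has a fixed shape and the surplus targets are simply dropped. A minimal sketch of that padding/truncation (the -1 class sentinel mirrors the YOLOv6 preprocess_image; treating it as the convention here is an assumption):

import torch

def pad_labels(targets, max_boxes_num=100):
    # targets: list of per-image [num_gt, 5] tensors laid out as [cls, box]
    # boxes beyond max_boxes_num are dropped; empty slots keep cls = -1 (assumed sentinel)
    bs = len(targets)
    labels = torch.zeros(bs, max_boxes_num, 5)
    labels[:, :, 0] = -1
    for i, t in enumerate(targets):
        t = t[:max_boxes_num]
        labels[i, :t.shape[0]] = t
    return labels

targets = [torch.rand(3, 5), torch.rand(150, 5)]   # second image exceeds the cap
print(pad_labels(targets).shape)                   # torch.Size([2, 100, 5])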
for i, target in enumerate(targets): if target.shape[0] > self.max_boxes_num: - target = target[:self.max_boxes_num, :] - labels[i][:target.shape[0]] = target + target = target[: self.max_boxes_num, :] + labels[i][: target.shape[0]] = target labels[:, :, 1:] = labels[:, :, 1:] else: labels = None @@ -231,7 +293,8 @@ def preprocess_image(self, batched_inputs, training): def forward(self, batched_inputs): images, labels, image_ori_sizes = self.preprocess_image( - batched_inputs, self.training) + batched_inputs, self.training + ) # batched_inputs[0]['image'] = images.tensor[0].cpu() * 255 # self.visualize_data(batched_inputs[0]) @@ -277,28 +340,27 @@ def _branch(_embedding, _in): if self.training: losses = [ - loss_evaluator(out, labels, img_size) for out, loss_evaluator in zip( - outs, self.loss_evaluators) + loss_evaluator(out, labels, img_size) + for out, loss_evaluator in zip(outs, self.loss_evaluators) ] if self.loss_type == "v7": - keys = ["loss_iou", "loss_xy", - "loss_wh", "loss_conf", "loss_cls"] + keys = ["loss_iou", "loss_xy", "loss_wh", "loss_conf", "loss_cls"] else: - keys = ["loss_x", "loss_y", "loss_w", - "loss_h", "loss_conf", "loss_cls"] + keys = ["loss_x", "loss_y", "loss_w", "loss_h", "loss_conf", "loss_cls"] losses_dict = {} for key in keys: losses_dict[key] = sum([loss[key] for loss in losses]) return losses_dict else: - predictions_list = [loss_evaluator(out, labels, img_size) for - out, loss_evaluator in zip(outs, self.loss_evaluators)] + predictions_list = [ + loss_evaluator(out, labels, img_size) + for out, loss_evaluator in zip(outs, self.loss_evaluators) + ] predictions = torch.cat(predictions_list, 1) - detections = postprocess(predictions, - self.num_classes, - self.conf_threshold, - self.nms_threshold) + detections = postprocess( + predictions, self.num_classes, self.conf_threshold, self.nms_threshold + ) results = [] for idx, out in enumerate(detections): @@ -314,7 +376,8 @@ def _branch(_embedding, _in): processed_results = [] for results_per_image, input_per_image, image_size in zip( - results, batched_inputs, images.image_sizes): + results, batched_inputs, images.image_sizes + ): height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) r = detector_postprocess(results_per_image, height, width) @@ -356,10 +419,8 @@ def __init__(self, cfg, anchors, level): self.l1_loss = nn.L1Loss(reduction="none") self.bce_loss = nn.BCELoss(reduction="none") - self.BCEcls = nn.BCEWithLogitsLoss( - pos_weight=torch.tensor([1.0]).to(device)) - self.BCEobj = nn.BCEWithLogitsLoss( - pos_weight=torch.tensor([1.0]).to(device)) + self.BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.0]).to(device)) + self.BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.0]).to(device)) self.bce_obj = nn.BCEWithLogitsLoss(reduction="none") self.bce_cls = nn.BCEWithLogitsLoss(reduction="none") @@ -373,38 +434,50 @@ def forward(self, input, targets=None, image_size=(416, 416)): stride_h = image_size[0] / in_h stride_w = image_size[1] / in_w - scaled_anchors = [(a_w, a_h) - for a_w, a_h in self.anchors] + scaled_anchors = [(a_w, a_h) for a_w, a_h in self.anchors] - prediction = input.view(bs, self.num_anchors, - self.bbox_attrs, in_h, in_w).permute(0, 1, 3, 4, 2).contiguous() # place bbox_attr to last order + prediction = ( + input.view(bs, self.num_anchors, self.bbox_attrs, in_h, in_w) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) # place bbox_attr to last order # Get outputs - x = torch.sigmoid(prediction[..., 0]) # Center x - y = 
torch.sigmoid(prediction[..., 1]) # Center y - w = prediction[..., 2] # Width - h = prediction[..., 3] # Height + x = torch.sigmoid(prediction[..., 0]) # Center x + y = torch.sigmoid(prediction[..., 1]) # Center y + w = prediction[..., 2] # Width + h = prediction[..., 3] # Height # conf = torch.sigmoid(prediction[..., 4]) # Conf # pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. - conf = prediction[..., 4] # Conf + conf = prediction[..., 4] # Conf pred_cls = prediction[..., 5:] # Cls pred. - def FloatTensor(x): return torch.FloatTensor(x).to(pred_cls.device) # noqa - def LongTensor(x): return torch.LongTensor(x).to(pred_cls.device) # noqa + def FloatTensor(x): + return torch.FloatTensor(x).to(pred_cls.device) # noqa + + def LongTensor(x): + return torch.LongTensor(x).to(pred_cls.device) # noqa # Calculate offsets for each grid - grid_x = FloatTensor(torch.linspace(0, in_w - 1, in_w).repeat(in_h, 1).repeat( - bs * self.num_anchors, 1, 1).view(x.shape)) - grid_y = FloatTensor(torch.linspace(0, in_h - 1, in_h).repeat(in_w, 1).t().repeat( - bs * self.num_anchors, 1, 1).view(y.shape)) + grid_x = FloatTensor( + torch.linspace(0, in_w - 1, in_w) + .repeat(in_h, 1) + .repeat(bs * self.num_anchors, 1, 1) + .view(x.shape) + ) + grid_y = FloatTensor( + torch.linspace(0, in_h - 1, in_h) + .repeat(in_w, 1) + .t() + .repeat(bs * self.num_anchors, 1, 1) + .view(y.shape) + ) # Calculate anchor w, h anchor_w = FloatTensor(scaled_anchors).index_select(1, LongTensor([0])) anchor_h = FloatTensor(scaled_anchors).index_select(1, LongTensor([1])) - anchor_w = anchor_w.repeat(bs, 1).repeat( - 1, 1, in_h * in_w).view(w.shape) - anchor_h = anchor_h.repeat(bs, 1).repeat( - 1, 1, in_h * in_w).view(h.shape) + anchor_w = anchor_w.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(w.shape) + anchor_h = anchor_h.repeat(bs, 1).repeat(1, 1, in_h * in_w).view(h.shape) # Add offset and scale with anchors pred_boxes = prediction[..., :4].clone() @@ -419,20 +492,49 @@ def LongTensor(x): return torch.LongTensor(x).to(pred_cls.device) # noqa # check if is training if targets is not None: # build target - if self.build_target_type == 'v5': - mask, obj_mask, \ - tx, ty, tw, th, \ - tgt_scale, tcls, nlabel = self.get_target_yolov5(targets, pred_boxes, image_size, - in_w, in_h, - stride_w, stride_h, - self.ignore_threshold) + if self.build_target_type == "v5": + ( + mask, + obj_mask, + tx, + ty, + tw, + th, + tgt_scale, + tcls, + nlabel, + ) = self.get_target_yolov5( + targets, + pred_boxes, + image_size, + in_w, + in_h, + stride_w, + stride_h, + self.ignore_threshold, + ) else: - mask, obj_mask, \ - tx, ty, tw, th, \ - tgt_scale, tcls, nlabel, num_fg = self.get_target(targets, pred_boxes, image_size, - in_w, in_h, - stride_w, stride_h, - self.ignore_threshold) + ( + mask, + obj_mask, + tx, + ty, + tw, + th, + tgt_scale, + tcls, + nlabel, + num_fg, + ) = self.get_target( + targets, + pred_boxes, + image_size, + in_w, + in_h, + stride_w, + stride_h, + self.ignore_threshold, + ) mask, obj_mask = mask.cuda(), obj_mask.cuda() tx, ty, tw, th = tx.cuda(), ty.cuda(), tw.cuda(), th.cuda() @@ -442,13 +544,12 @@ def LongTensor(x): return torch.LongTensor(x).to(pred_cls.device) # noqa # loss_conf = (obj_mask * self.bce_obj(conf, mask)).sum() / bs # mask is positive samples loss_obj = self.bce_obj(conf, mask) - loss_obj = (obj_mask * loss_obj) + loss_obj = obj_mask * loss_obj # loss_obj_neg = (loss_obj * (1-mask)*obj_mask) # loss_obj = loss_obj_pos + loss_obj_neg loss_obj = loss_obj.sum() - loss_cls = self.bce_cls( - pred_cls[mask == 1], 
tcls[mask == 1]).sum() + loss_cls = self.bce_cls(pred_cls[mask == 1], tcls[mask == 1]).sum() x = x.unsqueeze(-1) y = y.unsqueeze(-1) @@ -489,7 +590,7 @@ def LongTensor(x): return torch.LongTensor(x).to(pred_cls.device) # noqa if pboxes.shape[0] > 0: lbox = ciou(pboxes, tboxes, sum=False).to(pboxes.device) - lbox = tgt_scale*lbox.T + lbox = tgt_scale * lbox.T lbox = lbox.sum() else: lbox = torch.tensor(self.eps).to(pboxes.device) @@ -503,17 +604,20 @@ def LongTensor(x): return torch.LongTensor(x).to(pred_cls.device) # noqa } else: loss_conf = (obj_mask * self.bce_obj(conf, mask)).sum() / bs - loss_cls = self.bce_cls( - pred_cls[mask == 1], tcls[mask == 1]).sum() / bs - - loss_x = (mask * tgt_scale * - self.bce_loss(x * mask, tx * mask)).sum() / bs - loss_y = (mask * tgt_scale * - self.bce_loss(y * mask, ty * mask)).sum() / bs - loss_w = (mask * tgt_scale * - self.l1_loss(w * mask, tw * mask)).sum() / bs - loss_h = (mask * tgt_scale * - self.l1_loss(h * mask, th * mask)).sum() / bs + loss_cls = self.bce_cls(pred_cls[mask == 1], tcls[mask == 1]).sum() / bs + + loss_x = ( + mask * tgt_scale * self.bce_loss(x * mask, tx * mask) + ).sum() / bs + loss_y = ( + mask * tgt_scale * self.bce_loss(y * mask, ty * mask) + ).sum() / bs + loss_w = ( + mask * tgt_scale * self.l1_loss(w * mask, tw * mask) + ).sum() / bs + loss_h = ( + mask * tgt_scale * self.l1_loss(h * mask, th * mask) + ).sum() / bs # we are not using loss_x, loss_y here, just using a simple ciou loss loss = { @@ -529,14 +633,29 @@ def LongTensor(x): return torch.LongTensor(x).to(pred_cls.device) # noqa conf = torch.sigmoid(conf) pred_cls = torch.sigmoid(pred_cls) # Results - output = torch.cat((pred_boxes.view(bs, -1, 4), - conf.view(bs, -1, 1), pred_cls.view(bs, -1, self.num_classes)), -1) + output = torch.cat( + ( + pred_boxes.view(bs, -1, 4), + conf.view(bs, -1, 1), + pred_cls.view(bs, -1, self.num_classes), + ), + -1, + ) return output.data - def get_target(self, target, pred_boxes, img_size, - in_w, in_h, stride_w, stride_h, ignore_threshold): - - def FloatTensor(x): return torch.FloatTensor(x).to(pred_boxes.device) # noqa + def get_target( + self, + target, + pred_boxes, + img_size, + in_w, + in_h, + stride_w, + stride_h, + ignore_threshold, + ): + def FloatTensor(x): + return torch.FloatTensor(x).to(pred_boxes.device) # noqa bs = target.size(0) @@ -544,24 +663,22 @@ def FloatTensor(x): return torch.FloatTensor(x).to(pred_boxes.device) # noqa # logger.info('stride_h, {}, stride_w: {}'.format(stride_h, stride_w)) # logger.info('target shape: {}'.format(target.shape)) - mask = torch.zeros(bs, self.num_anchors, in_h, - in_w, requires_grad=False) - obj_mask = torch.ones(bs, self.num_anchors, - in_h, in_w, requires_grad=False) + mask = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) + obj_mask = torch.ones(bs, self.num_anchors, in_h, in_w, requires_grad=False) tx = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) ty = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) tw = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) th = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) - tgt_scale = torch.zeros(bs, self.num_anchors, - in_h, in_w, requires_grad=False) + tgt_scale = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) - tcls = torch.zeros(bs, self.num_anchors, in_h, in_w, - self.num_classes, requires_grad=False) + tcls = torch.zeros( + bs, self.num_anchors, in_h, in_w, self.num_classes, requires_grad=False + ) nlabel = 
(target.sum(dim=2) > 0).sum(dim=1) - gx_all = (target[:, :, 1] + target[:, :, 3]) / 2.0 # center x + gx_all = (target[:, :, 1] + target[:, :, 3]) / 2.0 # center x gy_all = (target[:, :, 2] + target[:, :, 4]) / 2.0 # center y - gw_all = (target[:, :, 3] - target[:, :, 1]) # width - gh_all = (target[:, :, 4] - target[:, :, 2]) # height + gw_all = target[:, :, 3] - target[:, :, 1] # width + gh_all = target[:, :, 4] - target[:, :, 2] # height gi_all = (gx_all / stride_w).to(torch.int16) gj_all = (gy_all / stride_h).to(torch.int16) @@ -578,25 +695,25 @@ def FloatTensor(x): return torch.FloatTensor(x).to(pred_boxes.device) # noqa truth_j = gj_all[b, :n] # change match strategy, by not using IoU maxium - anchor_ious_all = bboxes_iou(truth_box.cpu(), - self.ref_anchors.type_as(truth_box.cpu()), xyxy=False) + anchor_ious_all = bboxes_iou( + truth_box.cpu(), self.ref_anchors.type_as(truth_box.cpu()), xyxy=False + ) best_n_all = np.argmax(anchor_ious_all, axis=1) # so we know which level it belongs to, 3 might be len(anchors) best_n = best_n_all % 3 - best_n_mask = ((best_n_all // 3) == self.level) + best_n_mask = (best_n_all // 3) == self.level truth_box[:n, 0] = gx_all[b, :n] truth_box[:n, 1] = gy_all[b, :n] pred_box = pred_boxes[b] - pred_ious = bboxes_iou(pred_box.view(-1, 4), - truth_box, xyxy=False) + pred_ious = bboxes_iou(pred_box.view(-1, 4), truth_box, xyxy=False) # print(pred_box.shape) # pred_ious = bboxes_iou2(pred_box.view(-1, 4), # truth_box, x1y1x2y2=False, CIoU=True) pred_best_iou, _ = pred_ious.max(dim=1) - pred_best_iou = (pred_best_iou > ignore_threshold) + pred_best_iou = pred_best_iou > ignore_threshold pred_best_iou = pred_best_iou.view(pred_box.shape[:3]) obj_mask[b] = ~pred_best_iou @@ -621,23 +738,31 @@ def FloatTensor(x): return torch.FloatTensor(x).to(pred_boxes.device) # noqa tx[b, a, gj, gi] = gx / stride_w - gi ty[b, a, gj, gi] = gy / stride_h - gj # Width and height - tw[b, a, gj, gi] = torch.log( - gw / self.anchors[a][0] + 1e-16) - th[b, a, gj, gi] = torch.log( - gh / self.anchors[a][1] + 1e-16) + tw[b, a, gj, gi] = torch.log(gw / self.anchors[a][0] + 1e-16) + th[b, a, gj, gi] = torch.log(gh / self.anchors[a][1] + 1e-16) - tgt_scale[b, a, gj, gi] = 2.0 - gw * \ - gh / (img_size[0] * img_size[1]) + tgt_scale[b, a, gj, gi] = 2.0 - gw * gh / ( + img_size[0] * img_size[1] + ) # One-hot encoding of label tcls[b, a, gj, gi, int(target[b, t, 0])] = 1 num_fg = max(num_fg, 1) return mask, obj_mask, tx, ty, tw, th, tgt_scale, tcls, nlabel, num_fg - def get_target_yolov5(self, target, pred_boxes, img_size, - in_w, in_h, stride_w, stride_h, ignore_threshold): - - def FloatTensor(x): return torch.FloatTensor(x).to(pred_boxes.device) # noqa + def get_target_yolov5( + self, + target, + pred_boxes, + img_size, + in_w, + in_h, + stride_w, + stride_h, + ignore_threshold, + ): + def FloatTensor(x): + return torch.FloatTensor(x).to(pred_boxes.device) # noqa bs = target.size(0) @@ -645,24 +770,22 @@ def FloatTensor(x): return torch.FloatTensor(x).to(pred_boxes.device) # noqa # logger.info('stride_h, {}, stride_w: {}'.format(stride_h, stride_w)) # logger.info('target shape: {}'.format(target.shape)) - mask = torch.zeros(bs, self.num_anchors, in_h, - in_w, requires_grad=False) - obj_mask = torch.ones(bs, self.num_anchors, - in_h, in_w, requires_grad=False) + mask = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) + obj_mask = torch.ones(bs, self.num_anchors, in_h, in_w, requires_grad=False) tx = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) ty = 
torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) tw = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) th = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) - tgt_scale = torch.zeros(bs, self.num_anchors, - in_h, in_w, requires_grad=False) + tgt_scale = torch.zeros(bs, self.num_anchors, in_h, in_w, requires_grad=False) - tcls = torch.zeros(bs, self.num_anchors, in_h, in_w, - self.num_classes, requires_grad=False) + tcls = torch.zeros( + bs, self.num_anchors, in_h, in_w, self.num_classes, requires_grad=False + ) nlabel = (target.sum(dim=2) > 0).sum(dim=1) - gx_all = (target[:, :, 1] + target[:, :, 3]) / 2.0 # center x + gx_all = (target[:, :, 1] + target[:, :, 3]) / 2.0 # center x gy_all = (target[:, :, 2] + target[:, :, 4]) / 2.0 # center y - gw_all = (target[:, :, 3] - target[:, :, 1]) # width - gh_all = (target[:, :, 4] - target[:, :, 2]) # height + gw_all = target[:, :, 3] - target[:, :, 1] # width + gh_all = target[:, :, 4] - target[:, :, 2] # height gi_all = (gx_all / stride_w).to(torch.int16) gj_all = (gy_all / stride_h).to(torch.int16) @@ -692,26 +815,32 @@ def FloatTensor(x): return torch.FloatTensor(x).to(pred_boxes.device) # noqa # (todo) this strategy not work, find why anchor_indices_mask = get_matching_anchors( - truth_box.cpu(), self.ref_anchors.type_as(truth_box.cpu()), xyxy=False, anchor_ratio_thresh=self.anchor_ratio_thresh) + truth_box.cpu(), + self.ref_anchors.type_as(truth_box.cpu()), + xyxy=False, + anchor_ratio_thresh=self.anchor_ratio_thresh, + ) # [[False, False, False, False, True, True, False, False, False], # [False, False, False, False, False, False, False, True, False], # [False, False, False, True, True, True, False, False, False], # [False, True, True, True, True, False, False, False, False]] N x anchor_num # one box, might have more than one anchor in all 9 anchors # select mask of current level - anchor_indices_mask = anchor_indices_mask[:, self.level * - self.num_anchors: self.level*self.num_anchors + self.num_anchors] + anchor_indices_mask = anchor_indices_mask[ + :, + self.level * self.num_anchors : self.level * self.num_anchors + + self.num_anchors, + ] # now we get boxes anchor indices, of current level truth_box[:n, 0] = gx_all[b, :n] truth_box[:n, 1] = gy_all[b, :n] pred_box = pred_boxes[b] - pred_ious = bboxes_iou(pred_box.view(-1, 4), - truth_box, xyxy=False) + pred_ious = bboxes_iou(pred_box.view(-1, 4), truth_box, xyxy=False) pred_best_iou, _ = pred_ious.max(dim=1) - pred_best_iou = (pred_best_iou > ignore_threshold) + pred_best_iou = pred_best_iou > ignore_threshold pred_best_iou = pred_best_iou.view(pred_box.shape[:3]) obj_mask[b] = ~pred_best_iou @@ -741,13 +870,10 @@ def FloatTensor(x): return torch.FloatTensor(x).to(pred_boxes.device) # noqa tx[b, a, gj, gi] = gx / stride_w - gi ty[b, a, gj, gi] = gy / stride_h - gj # Width and height - tw[b, a, gj, gi] = torch.log( - gw / self.anchors[a][0] + 1e-16) - th[b, a, gj, gi] = torch.log( - gh / self.anchors[a][1] + 1e-16) + tw[b, a, gj, gi] = torch.log(gw / self.anchors[a][0] + 1e-16) + th[b, a, gj, gi] = torch.log(gh / self.anchors[a][1] + 1e-16) - tgt_scale[b, a, gj, gi] = 2.0 - gw * \ - gh / (img_size[0] * img_size[1]) + tgt_scale[b, a, gj, gi] = 2.0 - gw * gh / (img_size[0] * img_size[1]) # One-hot encoding of label tcls[b, a, gj, gi, int(target[b, t, 0])] = 1 @@ -770,7 +896,7 @@ def get_matching_anchors(gt_boxes, anchors, anchor_ratio_thresh=2.1, xyxy=True): # print('r', r) # print(r.shape) r = r.squeeze(1) - j = torch.max(r, 1. 
/ r).max(-1)[0] < anchor_ratio_thresh + j = torch.max(r, 1.0 / r).max(-1)[0] < anchor_ratio_thresh # print('j shape: ', j.shape) # j can be used for best_n_all return j diff --git a/yolov7/modeling/meta_arch/yolov7p.py b/yolov7/modeling/meta_arch/yolov7p.py old mode 100755 new mode 100644 index 6efd25b..64e1339 --- a/yolov7/modeling/meta_arch/yolov7p.py +++ b/yolov7/modeling/meta_arch/yolov7p.py @@ -27,9 +27,6 @@ from alfred.dl.metrics.iou_loss import ciou_loss, ciou from alfred.utils.log import logger - -from nb.torch.blocks.head_blocks import SPP, PANet - from yolov7.modeling.neck.yolo_fpn import YOLOFPN from ..neck.yolo_pafpn import YOLOPAFPN from yolov7.utils.boxes import postprocess, bboxes_iou diff --git a/yolov7/modeling/meta_arch/yolox.py b/yolov7/modeling/meta_arch/yolox.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/meta_arch/yolox_kpts.py b/yolov7/modeling/meta_arch/yolox_kpts.py new file mode 100644 index 0000000..4efd38b --- /dev/null +++ b/yolov7/modeling/meta_arch/yolox_kpts.py @@ -0,0 +1,259 @@ + + +import torch.nn as nn +import torch +from detectron2.modeling import META_ARCH_REGISTRY +from detectron2.modeling.backbone import build_backbone + +from detectron2.structures import Boxes, ImageList, Instances, image_list +from detectron2.utils import comm +from detectron2.utils.logger import log_first_n +from detectron2.modeling.postprocessing import detector_postprocess + +import torch.distributed as dist + +import numpy as np +import time +import logging +from alfred.utils.log import logger + +from ..head.yolox_head import YOLOXHead +from ..head.yolox_kpts_head import YOLOXHeadKPTS +from ..neck.yolo_pafpn import YOLOPAFPN + +from yolov7.utils.boxes import postprocess, BoxModeMy, postprocess_yolox_kpts + + +""" +Code for support construct YOLOX model +You need specify YOLOX as your ARCH +and using YOLOX backbone. 
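The keypoint variant below swaps in YOLOXHeadKPTS, which regresses KEYPOINTS_NUM keypoints per detection on top of the usual box/objectness/class outputs and whose results are decoded by postprocess_yolox_kpts.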
+This ARCH using AnchorFree manner to do training + +""" + + +@META_ARCH_REGISTRY.register() +class YOLOX(nn.Module): + + def __init__(self, cfg): + super(YOLOX, self).__init__() + # configurations + self.device = torch.device(cfg.MODEL.DEVICE) + self.conf_threshold = cfg.MODEL.YOLO.CONF_THRESHOLD + self.nms_threshold = cfg.MODEL.YOLO.NMS_THRESHOLD + self.nms_type = cfg.MODEL.NMS_TYPE + self.loss_type = cfg.MODEL.YOLO.LOSS_TYPE + + # l1 loss will open at last 15 epochs + self.use_l1 = False + + self.depth_mul = cfg.MODEL.YOLO.DEPTH_MUL + self.width_mul = cfg.MODEL.YOLO.WIDTH_MUL + + self.iter = 0 + self.max_iter = cfg.SOLVER.MAX_ITER + self.enable_l1_loss_at = cfg.INPUT.MOSAIC_AND_MIXUP.DISABLE_AT_ITER + self.max_boxes_num = cfg.MODEL.YOLO.MAX_BOXES_NUM + self.in_features = cfg.MODEL.YOLO.IN_FEATURES + + # self.num_classes = cfg.MODEL.YOLO.CLASSES + self.num_classes = 2 # only for person + self.keypoints_num = cfg.MODEL.YOLO.KEYPOINTS_NUM + + self.backbone = build_backbone(cfg) + backbone_shape = self.backbone.output_shape() + self.size_divisibility = 32 if self.backbone.size_divisibility == 0 else self.backbone.size_divisibility + backbone_shape = [backbone_shape[i].channels for i in self.in_features] + logger.info('backboneshape: {}, size_divisibility: {}'.format( + backbone_shape, self.size_divisibility)) + + # don't specific in_channels, let it calculate + self.neck = YOLOPAFPN( + depth=self.depth_mul, width=self.width_mul, in_features=self.in_features) + # self.head = YOLOXHead(self.num_classes, width=self.width_mul) + self.head = YOLOXHeadKPTS(self.num_classes, width=self.width_mul, num_kpts=self.keypoints_num) + + pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view( + 3, 1, 1) + pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view( + 3, 1, 1) + self.padded_value = cfg.MODEL.PADDED_VALUE + self.normalizer = lambda x: (x / 255. 
- pixel_mean) / pixel_std + self.to(self.device) + self.onnx_export = False + self.onnx_vis = False + + self.apply(self._init_model) + self.head.initialize_biases(1e-2) + + @staticmethod + def _init_model(M): + for m in M.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eps = 1e-3 + m.momentum = 0.03 + + def update_iter(self, i): + self.iter = i + + def preprocess_image(self, batched_inputs, training): + images = [x["image"].to(self.device) for x in batched_inputs] + bs = len(images) + # images = [self.normalizer(x) for x in images] + images = [x.type(torch.float) for x in images] + + images = ImageList.from_tensors( + images, size_divisibility=self.size_divisibility, pad_value=self.padded_value) + # logger.info('images ori shape: {}'.format(images.tensor.shape)) + + if training and self.iter > self.enable_l1_loss_at and not self.use_l1: + meg = torch.BoolTensor(1).to(self.device) + if comm.is_main_process(): + logger.info( + '[master] enable l1 loss now at iter: {}'.format(self.iter)) + # enable l1 loss at last 50000 iterations + meg.fill_(True) + + if comm.get_world_size() > 1: + comm.synchronize() + if comm.is_main_process(): + dist.broadcast(meg, 0) + self.head.use_l1 = meg.item() + self.use_l1 = meg.item() + comm.synchronize() + logger.info( + 'check head l1: {}'.format(self.head.use_l1)) + + if training: + if "instances" in batched_inputs[0]: + gt_instances = [ + x["instances"].to(self.device) for x in batched_inputs + ] + elif "targets" in batched_inputs[0]: + log_first_n( + logging.WARN, + "'targets' in the model inputs is now renamed to 'instances'!", + n=10) + gt_instances = [ + x["targets"].to(self.device) for x in batched_inputs + ] + else: + gt_instances = None + + if gt_instances: + for i in gt_instances: + i.gt_boxes.tensor = BoxModeMy.convert( + i.gt_boxes.tensor, from_mode=BoxModeMy.XYXY_ABS, to_mode=BoxModeMy.XYWH_ABS) + + targets = [ + torch.cat( + # YOLOX using [cls, box], box is cx cy w h + [instance.gt_classes.float().unsqueeze(-1), instance.gt_boxes.tensor], dim=-1 + # [instance.gt_boxes.tensor, instance.gt_classes.float().unsqueeze(-1), ], dim=-1 + ) + for instance in gt_instances + ] + + labels = torch.zeros((bs, self.max_boxes_num, 5)) + for i, target in enumerate(targets): + if target.shape[0] > self.max_boxes_num: + target = target[:self.max_boxes_num, :] + labels[i][:target.shape[0]] = target + else: + labels = None + + # self.iter += 1 + return images, labels, images.image_sizes + + def preprocess_input(self, x): + x = x.permute(0, 3, 1, 2) + # x = F.interpolate(x, size=(640, 640)) + # x = F.interpolate(x, size=(512, 960)) + # x = self.normalizer(x) + return x + + def forward(self, batched_inputs): + if self.onnx_export: + logger.info('[WARN] exporting onnx...') + assert isinstance(batched_inputs, torch.Tensor) or isinstance( + batched_inputs, list), 'onnx export, batched_inputs only needs image tensor' + x = self.preprocess_input(batched_inputs) + # batched_inputs = batched_inputs.permute(0, 3, 1, 2) + image_ori_sizes = [batched_inputs.shape[1:3]] + else: + images, labels, image_ori_sizes = self.preprocess_image( + batched_inputs, self.training) + if labels is not None: + labels = labels.to(images.device) + + x = images.tensor + img_size = x.shape[-2:] + # logger.info('img size: {}'.format(img_size)) + + if self.eval: + t0 = time.time() + + out_features = self.backbone(x) + fpn_outs = self.neck(out_features) # 512, 1024, 2048, s, m, l + + if self.training: + # print(labels) + loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head( + fpn_outs, 
labels, x + ) + + outputs = { + "total_loss": loss, + "iou_loss": iou_loss, + "conf_loss": conf_loss, + "cls_loss": cls_loss, + } + if self.use_l1: + outputs['l1_loss'] = l1_loss + return outputs + else: + if self.onnx_export: + if not self.onnx_vis: + # self.head.decode_in_inference = False + self.head.decode_in_inference = True + self.head.onnx_export = True + # we wrap box decode into onnx model as well + outputs = self.head(fpn_outs) + return outputs + else: + self.head.decode_in_inference = True + outputs = self.head(fpn_outs) + detections = postprocess_yolox_kpts( + outputs, self.num_classes, self.conf_threshold, self.nms_threshold) + return detections + else: + outputs = self.head(fpn_outs) + + t1 = time.time() + + detections = postprocess_yolox_kpts( + outputs, self.num_classes, self.conf_threshold, self.nms_threshold) + + results = [] + for idx, out in enumerate(detections): + if out is None: + out = x.new_zeros((0, 7)) + image_size = image_ori_sizes[idx] + result = Instances(image_size) + result.pred_boxes = Boxes(out[:, :4]) + result.scores = out[:, 5] * out[:, 4] + result.pred_classes = out[:, 6] + result.pred_keypoints = out[:, 6:] + # TODO: add keypoints prediction + results.append(result) + + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + results, batched_inputs, images.image_sizes): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + # return processed_results, t1 - t0 + return processed_results diff --git a/yolov7/modeling/neck/bifpn.py b/yolov7/modeling/neck/bifpn.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/neck/ppyolo_pan.py b/yolov7/modeling/neck/ppyolo_pan.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/neck/reppan.py b/yolov7/modeling/neck/reppan.py new file mode 100644 index 0000000..10eb1a1 --- /dev/null +++ b/yolov7/modeling/neck/reppan.py @@ -0,0 +1,105 @@ +import torch +from torch import nn +from ..backbone.efficientrep import RepBlock, SimConv, Transpose + + +class RepPANNeck(nn.Module): + """RepPANNeck Module + EfficientRep is the default backbone of this model. + RepPANNeck has the balance of feature fusion ability and hardware efficiency. 
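    The three input levels are fused by a top-down path (1x1 SimConv reduce, Transpose upsample, RepBlock)
    followed by a bottom-up path (stride-2 SimConv downsample, RepBlock), returning
    [pan_out2, pan_out1, pan_out0] ordered from fine to coarse.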
+ """ + + def __init__(self, channels_list=None, num_repeats=None, in_features=None): + super().__init__() + + assert channels_list is not None + assert num_repeats is not None + + self.in_features = in_features + + self.Rep_p4 = RepBlock( + in_channels=channels_list[3] + channels_list[5], + out_channels=channels_list[5], + n=num_repeats[5], + ) + + self.Rep_p3 = RepBlock( + in_channels=channels_list[2] + channels_list[6], + out_channels=channels_list[6], + n=num_repeats[6], + ) + + self.Rep_n3 = RepBlock( + in_channels=channels_list[6] + channels_list[7], + out_channels=channels_list[8], + n=num_repeats[7], + ) + + self.Rep_n4 = RepBlock( + in_channels=channels_list[5] + channels_list[9], + out_channels=channels_list[10], + n=num_repeats[8], + ) + + self.reduce_layer0 = SimConv( + in_channels=channels_list[4], + out_channels=channels_list[5], + kernel_size=1, + stride=1, + ) + + self.upsample0 = Transpose( + in_channels=channels_list[5], + out_channels=channels_list[5], + ) + + self.reduce_layer1 = SimConv( + in_channels=channels_list[5], + out_channels=channels_list[6], + kernel_size=1, + stride=1, + ) + + self.upsample1 = Transpose( + in_channels=channels_list[6], out_channels=channels_list[6] + ) + + self.downsample2 = SimConv( + in_channels=channels_list[6], + out_channels=channels_list[7], + kernel_size=3, + stride=2, + ) + + self.downsample1 = SimConv( + in_channels=channels_list[8], + out_channels=channels_list[9], + kernel_size=3, + stride=2, + ) + + def forward(self, input): + input = [input[f] for f in self.in_features] + (x2, x1, x0) = input + + fpn_out0 = self.reduce_layer0(x0) + upsample_feat0 = self.upsample0(fpn_out0) + f_concat_layer0 = torch.cat([upsample_feat0, x1], 1) + f_out0 = self.Rep_p4(f_concat_layer0) + + fpn_out1 = self.reduce_layer1(f_out0) + upsample_feat1 = self.upsample1(fpn_out1) + f_concat_layer1 = torch.cat([upsample_feat1, x2], 1) + pan_out2 = self.Rep_p3(f_concat_layer1) + + down_feat1 = self.downsample2(pan_out2) + p_concat_layer1 = torch.cat([down_feat1, fpn_out1], 1) + pan_out1 = self.Rep_n3(p_concat_layer1) + + down_feat0 = self.downsample1(pan_out1) + p_concat_layer2 = torch.cat([down_feat0, fpn_out0], 1) + pan_out0 = self.Rep_n4(p_concat_layer2) + + outputs = [pan_out2, pan_out1, pan_out0] + + return outputs diff --git a/yolov7/modeling/neck/yolo_fpn.py b/yolov7/modeling/neck/yolo_fpn.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/neck/yolo_pafpn.py b/yolov7/modeling/neck/yolo_pafpn.py old mode 100755 new mode 100644 diff --git a/yolov7/modeling/transcoders/__init__.py b/yolov7/modeling/transcoders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/yolov7/modeling/transcoders/decoder_sparseinst.py b/yolov7/modeling/transcoders/decoder_sparseinst.py new file mode 100644 index 0000000..7059022 --- /dev/null +++ b/yolov7/modeling/transcoders/decoder_sparseinst.py @@ -0,0 +1,255 @@ +# Copyright (c) Tianheng Cheng and its affiliates. 
All Rights Reserved + +import math +import torch +import torch.nn as nn +from torch.nn import init +import torch.nn.functional as F + +from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill + +from detectron2.utils.registry import Registry +from detectron2.layers import Conv2d + +SPARSE_INST_DECODER_REGISTRY = Registry("SPARSE_INST_DECODER") +SPARSE_INST_DECODER_REGISTRY.__doc__ = "registry for SparseInst decoder" + + +def _make_stack_3x3_convs(num_convs, in_channels, out_channels): + convs = [] + for _ in range(num_convs): + convs.append(Conv2d(in_channels, out_channels, 3, padding=1)) + convs.append(nn.ReLU(True)) + in_channels = out_channels + return nn.Sequential(*convs) + + +class InstanceBranch(nn.Module): + def __init__(self, cfg, in_channels): + super().__init__() + # norm = cfg.MODEL.SPARSE_INST.DECODER.NORM + dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM + num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS + num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS + kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM + self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES + + self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) + # iam prediction, a simple conv + self.iam_conv = nn.Conv2d(dim, num_masks, 3, padding=1) + + # outputs + self.cls_score = nn.Linear(dim, self.num_classes) + self.mask_kernel = nn.Linear(dim, kernel_dim) + self.objectness = nn.Linear(dim, 1) + + self.prior_prob = 0.01 + self._init_weights() + + def _init_weights(self): + for m in self.inst_convs.modules(): + if isinstance(m, nn.Conv2d): + c2_msra_fill(m) + bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) + for module in [self.iam_conv, self.cls_score]: + init.constant_(module.bias, bias_value) + init.normal_(self.iam_conv.weight, std=0.01) + init.normal_(self.cls_score.weight, std=0.01) + + init.normal_(self.mask_kernel.weight, std=0.01) + init.constant_(self.mask_kernel.bias, 0.0) + + def forward(self, features): + # instance features (x4 convs) + features = self.inst_convs(features) + # predict instance activation maps + iam = self.iam_conv(features) + iam_prob = iam.sigmoid() + + B, N = iam_prob.shape[:2] + C = features.size(1) + # BxNxHxW -> BxNx(HW) + iam_prob = iam_prob.view(B, N, -1) + # aggregate features: BxCxHxW -> Bx(HW)xC + inst_features = torch.bmm(iam_prob, features.view(B, C, -1).permute(0, 2, 1)) + normalizer = iam_prob.sum(-1).clamp(min=1e-6) + inst_features = inst_features / normalizer[:, :, None] + # predict classification & segmentation kernel & objectness + pred_logits = self.cls_score(inst_features) + pred_kernel = self.mask_kernel(inst_features) + pred_scores = self.objectness(inst_features) + return pred_logits, pred_kernel, pred_scores, iam + + +class MaskBranch(nn.Module): + def __init__(self, cfg, in_channels): + super().__init__() + dim = cfg.MODEL.SPARSE_INST.DECODER.MASK.DIM + num_convs = cfg.MODEL.SPARSE_INST.DECODER.MASK.CONVS + kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM + self.mask_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) + self.projection = nn.Conv2d(dim, kernel_dim, kernel_size=1) + self._init_weights() + + def _init_weights(self): + for m in self.mask_convs.modules(): + if isinstance(m, nn.Conv2d): + c2_msra_fill(m) + c2_msra_fill(self.projection) + + def forward(self, features): + # mask features (x4 convs) + features = self.mask_convs(features) + return self.projection(features) + + +@SPARSE_INST_DECODER_REGISTRY.register() +class BaseIAMDecoder(nn.Module): + def __init__(self, cfg): + 
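        """Build the IAM decoder: an InstanceBranch (IAM maps, class scores, mask kernels,
        objectness) plus a MaskBranch, both fed encoder features concatenated with two
        coordinate channels."""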
super().__init__() + # add 2 for coordinates + in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 + + self.scale_factor = cfg.MODEL.SPARSE_INST.DECODER.SCALE_FACTOR + self.output_iam = cfg.MODEL.SPARSE_INST.DECODER.OUTPUT_IAM + + self.inst_branch = InstanceBranch(cfg, in_channels) + self.mask_branch = MaskBranch(cfg, in_channels) + + @torch.no_grad() + def compute_coordinates(self, x): + h, w = x.size(2), x.size(3) + y_loc = torch.linspace(-1, 1, h, device=x.device) + x_loc = torch.linspace(-1, 1, w, device=x.device) + y_loc, x_loc = torch.meshgrid(y_loc, x_loc) + y_loc = y_loc.expand([x.shape[0], 1, -1, -1]) + x_loc = x_loc.expand([x.shape[0], 1, -1, -1]) + locations = torch.cat([x_loc, y_loc], 1) + return locations.to(x) + + def forward(self, features): + coord_features = self.compute_coordinates(features) + features = torch.cat([coord_features, features], dim=1) + pred_logits, pred_kernel, pred_scores, iam = self.inst_branch(features) + mask_features = self.mask_branch(features) + + N = pred_kernel.shape[1] + # mask_features: BxCxHxW + if torch.onnx.is_in_onnx_export(): + sh = mask_features.shape + pred_masks = torch.bmm(pred_kernel, mask_features.view(sh[0], sh[1], sh[2] * sh[3])).view(sh[0], N, sh[2], sh[3]) + else: + B, C, H, W = mask_features.shape + pred_masks = torch.bmm(pred_kernel, mask_features.view(B, C, H * W)).view( + B, N, H, W + ) + + pred_masks = F.interpolate( + pred_masks, + scale_factor=self.scale_factor, + mode="bilinear", + align_corners=False, + ) + + output = { + "pred_logits": pred_logits, + "pred_masks": pred_masks, + "pred_scores": pred_scores, + } + + if self.output_iam: + iam = F.interpolate( + iam, + scale_factor=self.scale_factor, + mode="bilinear", + align_corners=False, + ) + output["pred_iam"] = iam + + return output + + +class GroupInstanceBranch(nn.Module): + def __init__(self, cfg, in_channels): + super().__init__() + dim = cfg.MODEL.SPARSE_INST.DECODER.INST.DIM + num_convs = cfg.MODEL.SPARSE_INST.DECODER.INST.CONVS + num_masks = cfg.MODEL.SPARSE_INST.DECODER.NUM_MASKS + kernel_dim = cfg.MODEL.SPARSE_INST.DECODER.KERNEL_DIM + self.num_groups = cfg.MODEL.SPARSE_INST.DECODER.GROUPS + self.num_classes = cfg.MODEL.SPARSE_INST.DECODER.NUM_CLASSES + + self.inst_convs = _make_stack_3x3_convs(num_convs, in_channels, dim) + # iam prediction, a group conv + expand_dim = dim * self.num_groups + self.iam_conv = nn.Conv2d( + dim, num_masks * self.num_groups, 3, padding=1, groups=self.num_groups + ) + # outputs + self.fc = nn.Linear(expand_dim, expand_dim) + + self.cls_score = nn.Linear(expand_dim, self.num_classes) + self.mask_kernel = nn.Linear(expand_dim, kernel_dim) + self.objectness = nn.Linear(expand_dim, 1) + + self.prior_prob = 0.01 + self._init_weights() + + def _init_weights(self): + for m in self.inst_convs.modules(): + if isinstance(m, nn.Conv2d): + c2_msra_fill(m) + bias_value = -math.log((1 - self.prior_prob) / self.prior_prob) + for module in [self.iam_conv, self.cls_score]: + init.constant_(module.bias, bias_value) + init.normal_(self.iam_conv.weight, std=0.01) + init.normal_(self.cls_score.weight, std=0.01) + + init.normal_(self.mask_kernel.weight, std=0.01) + init.constant_(self.mask_kernel.bias, 0.0) + c2_xavier_fill(self.fc) + + def forward(self, features): + # instance features (x4 convs) + features = self.inst_convs(features) + # predict instance activation maps + iam = self.iam_conv(features) + iam_prob = iam.sigmoid() + + B, N = iam_prob.shape[:2] + C = features.size(1) + # BxNxHxW -> BxNx(HW) + iam_prob = iam_prob.view(B, N, -1) + 
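        # note: N here equals num_masks * num_groups because iam_conv is a grouped conv;
        # the per-group features are folded back together by the reshape further below
        # (which assumes 4 groups)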
# aggregate features: BxCxHxW -> Bx(HW)xC + inst_features = torch.bmm(iam_prob, features.view(B, C, -1).permute(0, 2, 1)) + normalizer = iam_prob.sum(-1).clamp(min=1e-6, max=1e5) + # print(normalizer) + inst_features = inst_features / normalizer[:, :, None] + + # d4 = torch.div(N, 4, rounding_mode='floor') # can't use this for onnx tracable + d4 = N // 4 + inst_features = ( + inst_features.reshape(B, 4, d4, -1) + .transpose(1, 2) + .reshape(B, d4, -1) + ) + + inst_features = F.relu_(self.fc(inst_features)) + # predict classification & segmentation kernel & objectness + pred_logits = self.cls_score(inst_features) + pred_kernel = self.mask_kernel(inst_features) + pred_scores = self.objectness(inst_features) + return pred_logits, pred_kernel, pred_scores, iam + + +@SPARSE_INST_DECODER_REGISTRY.register() +class GroupIAMDecoder(BaseIAMDecoder): + def __init__(self, cfg): + super().__init__(cfg) + in_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + 2 + self.inst_branch = GroupInstanceBranch(cfg, in_channels) + + +def build_sparse_inst_decoder(cfg): + name = cfg.MODEL.SPARSE_INST.DECODER.NAME + return SPARSE_INST_DECODER_REGISTRY.get(name)(cfg) diff --git a/yolov7/modeling/transcoders/encoder_sparseinst.py b/yolov7/modeling/transcoders/encoder_sparseinst.py new file mode 100644 index 0000000..d63e6d0 --- /dev/null +++ b/yolov7/modeling/transcoders/encoder_sparseinst.py @@ -0,0 +1,132 @@ +# Copyright (c) Tianheng Cheng and its affiliates. All Rights Reserved + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from fvcore.nn.weight_init import c2_msra_fill, c2_xavier_fill + +from detectron2.utils.registry import Registry +from detectron2.layers import Conv2d +from alfred.utils.log import logger + +SPARSE_INST_ENCODER_REGISTRY = Registry("SPARSE_INST_ENCODER") +SPARSE_INST_ENCODER_REGISTRY.__doc__ = "registry for SparseInst decoder" + + +class MyAdaptiveAvgPool2d(nn.Module): + def __init__(self, sz=None): + super().__init__() + self.sz = sz + + def forward(self, x): + inp_size = x.size() + kernel_width, kernel_height = inp_size[2], inp_size[3] + if self.sz is not None: + if isinstance(self.sz, int): + kernel_width = math.ceil(inp_size[2] / self.sz) + kernel_height = math.ceil(inp_size[3] / self.sz) + elif isinstance(self.sz, list) or isinstance(self.sz, tuple): + assert len(self.sz) == 2 + kernel_width = math.ceil(inp_size[2] / self.sz[0]) + kernel_height = math.ceil(inp_size[3] / self.sz[1]) + if torch.is_tensor(kernel_width): + kernel_width = kernel_width.item() + kernel_height = kernel_height.item() + return F.avg_pool2d( + input=x, ceil_mode=False, kernel_size=(kernel_width, kernel_height) + ) + + +class PyramidPoolingModule(nn.Module): + def __init__(self, in_channels, channels=512, sizes=(1, 2, 3, 6)): + super().__init__() + self.stages = [] + self.stages = nn.ModuleList( + [self._make_stage(in_channels, channels, size) for size in sizes] + ) + self.bottleneck = Conv2d(in_channels + len(sizes) * channels, in_channels, 1) + + def _make_stage(self, features, out_features, size): + # prior = nn.AdaptiveAvgPool2d(output_size=(size, size)) + prior = MyAdaptiveAvgPool2d((size, size)) + conv = Conv2d(features, out_features, 1) + return nn.Sequential(prior, conv) + + def forward(self, feats): + h, w = feats.size(2), feats.size(3) + priors = [ + F.interpolate( + input=F.relu_(stage(feats)), + size=(h, w), + mode="bilinear", + align_corners=False, + ) + for stage in self.stages + ] + [feats] + out = F.relu_(self.bottleneck(torch.cat(priors, 1))) + return out + 
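# A minimal usage sketch for PyramidPoolingModule above (a hypothetical helper added
# for illustration, not part of the upstream SparseInst code); torch is already
# imported at the top of this file and the tensor sizes below are only assumptions.
def _ppm_shape_check():
    # channel ratio mirrors InstanceContextEncoder below (channels = in_channels // 4)
    ppm = PyramidPoolingModule(in_channels=256, channels=64, sizes=(1, 2, 3, 6))
    feats = torch.randn(2, 256, 32, 32)  # B x C x H x W encoder feature map
    out = ppm(feats)
    # each stage average-pools to roughly its bin size, projects to 64 channels and
    # is upsampled back to 32x32; the input plus four priors (256 + 4 * 64 = 512
    # channels) then pass through the 1x1 bottleneck back to 256 channels
    assert out.shape == (2, 256, 32, 32)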
+ +@SPARSE_INST_ENCODER_REGISTRY.register() +class InstanceContextEncoder(nn.Module): + """ + Instance Context Encoder + 1. construct feature pyramids from ResNet + 2. enlarge receptive fields (ppm) + 3. multi-scale fusion + """ + + def __init__(self, cfg, input_shape): + super().__init__() + self.num_channels = cfg.MODEL.SPARSE_INST.ENCODER.NUM_CHANNELS + self.in_features = cfg.MODEL.SPARSE_INST.ENCODER.IN_FEATURES + # self.norm = cfg.MODEL.SPARSE_INST.ENCODER.NORM + # depthwise = cfg.MODEL.SPARSE_INST.ENCODER.DEPTHWISE + self.in_channels = [input_shape[f].channels for f in self.in_features] + # self.using_bias = self.norm == "" + fpn_laterals = [] + fpn_outputs = [] + # groups = self.num_channels if depthwise else 1 + for in_channel in reversed(self.in_channels): + lateral_conv = Conv2d(in_channel, self.num_channels, 1) + output_conv = Conv2d(self.num_channels, self.num_channels, 3, padding=1) + c2_xavier_fill(lateral_conv) + c2_xavier_fill(output_conv) + fpn_laterals.append(lateral_conv) + fpn_outputs.append(output_conv) + self.fpn_laterals = nn.ModuleList(fpn_laterals) + self.fpn_outputs = nn.ModuleList(fpn_outputs) + # ppm + self.ppm = PyramidPoolingModule(self.num_channels, self.num_channels // 4) + # final fusion + self.fusion = nn.Conv2d(self.num_channels * 3, self.num_channels, 1) + c2_msra_fill(self.fusion) + + def forward(self, features): + features = [features[f] for f in self.in_features] + features = features[::-1] + prev_features = self.ppm(self.fpn_laterals[0](features[0])) + outputs = [self.fpn_outputs[0](prev_features)] + for feature, lat_conv, output_conv in zip( + features[1:], self.fpn_laterals[1:], self.fpn_outputs[1:] + ): + lat_features = lat_conv(feature) + top_down_features = F.interpolate( + prev_features, scale_factor=2.0, mode="nearest" + ) + prev_features = lat_features + top_down_features + outputs.insert(0, output_conv(prev_features)) + size = outputs[0].shape[2:] + features = [outputs[0]] + [ + F.interpolate(x, size, mode="bilinear", align_corners=False) + for x in outputs[1:] + ] + features = self.fusion(torch.cat(features, dim=1)) + return features + + +def build_sparse_inst_encoder(cfg, input_shape): + name = cfg.MODEL.SPARSE_INST.ENCODER.NAME + return SPARSE_INST_ENCODER_REGISTRY.get(name)(cfg, input_shape) diff --git a/yolov7/optimizer/__init__.py b/yolov7/optimizer/__init__.py new file mode 100644 index 0000000..025f043 --- /dev/null +++ b/yolov7/optimizer/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .build import build_optimizer_mapper + +__all__ = ["build_optimizer_mapper"] diff --git a/yolov7/optimizer/build.py b/yolov7/optimizer/build.py new file mode 100644 index 0000000..5208275 --- /dev/null +++ b/yolov7/optimizer/build.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import itertools +import logging +from typing import Any, Dict, List, Optional, Union + +import torch +from ..utils.qat_utils import iterate_module_named_parameters +from detectron2.solver.build import ( + maybe_add_gradient_clipping as d2_maybe_add_gradient_clipping, + reduce_param_groups, +) +from detectron2.utils.registry import Registry + + +D2GO_OPTIM_MAPPER_REGISTRY = Registry("D2GO_OPTIM_MAPPER") + +logger = logging.getLogger(__name__) + + +OptimizerModelsType = Union[torch.nn.Module, torch.nn.parallel.DistributedDataParallel] + + +def get_optimizer_param_groups(model: OptimizerModelsType, cfg): + """ + Get override optimizer parameter groups + * Get all default parameters + # Get parameter groups for normalization and bias + # Get parameter groups from model if the model implements `get_optimizer_param_groups()` + Parameters appear later will override parameters appear earlier + """ + # get all parameters that requires gradient + params = get_optimizer_param_groups_default(model) + + # parameter groups for lr + params += get_optimizer_param_groups_lr( + model, + base_lr=cfg.SOLVER.BASE_LR, + bias_lr_factor=cfg.SOLVER.BIAS_LR_FACTOR, + lr_multipliers_overwrite=_merge_dict(cfg.SOLVER.LR_MULTIPLIER_OVERWRITE), + ) + + # parameter groups for normalization, bias, and embedding + params += get_optimizer_param_groups_weight_decay( + model, + weight_decay=cfg.SOLVER.WEIGHT_DECAY, + weight_decay_norm=cfg.SOLVER.WEIGHT_DECAY_NORM, + weight_decay_bias=cfg.SOLVER.WEIGHT_DECAY_BIAS, + weight_decay_embed=cfg.SOLVER.WEIGHT_DECAY_EMBED, + ) + + # parameter groups from model function `model.get_optimizer_param_groups(opts)` + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model = model.module + if hasattr(model, "get_optimizer_param_groups"): + logger.info( + "Getting optimizer parameter groups from model.get_optimizer_param_groups()" + ) + params += model.get_optimizer_param_groups(cfg) + + return reduce_param_groups(params) + + +def get_optimizer_param_groups_default(model: OptimizerModelsType): + ret = [ + { + "params": list( + filter( + lambda x: x.requires_grad, + model.parameters(), + ) + ) + } + ] + return ret + + +def get_optimizer_param_groups_lr( + model: OptimizerModelsType, + base_lr: float, + bias_lr_factor: float = 1.0, + lr_multipliers_overwrite: Optional[Dict[str, float]] = None, +): + """ + Allow setting up lr for modules + base_lr: lr for all modules + bias_lr_factor: scale factor for lr for bias term + lr_multipliers_overwrite (dict: str-> float): + Applying different lr multiplier to a set of parameters whose names + containing certain keys. For example, if lr_multipliers_overwrite={'backbone': 0.1}, + the LR for the parameters whose names containing 'backbone' will be scaled to 0.1x. + Set lr_multipliers_overwrite=None if no multipliers required. + """ + params: List[Dict[str, Any]] = [] + for ( + module_name, + _module, + module_param_name, + value, + ) in iterate_module_named_parameters(model): + cur_lr = base_lr + if module_param_name == "bias": + cur_lr = base_lr * bias_lr_factor + if lr_multipliers_overwrite is not None: + for kname, mult in lr_multipliers_overwrite.items(): + if kname in module_name: + # apply multiplier for the params containing kname, e.g. 
backbone + cur_lr = cur_lr * mult + + params += [ + { + "params": [value], + "lr": cur_lr, + } + ] + + return params + + +def get_optimizer_param_groups_weight_decay( + model: OptimizerModelsType, + weight_decay: Optional[float], + weight_decay_norm: Optional[float] = None, + weight_decay_bias: Optional[float] = None, + weight_decay_embed: Optional[float] = None, +): + """ + Allow setting up weight decay for normalization, embedding and bias + """ + if weight_decay_norm is None: + weight_decay_norm = weight_decay + if weight_decay_bias is None: + weight_decay_bias = weight_decay + if weight_decay_embed is None: + weight_decay_embed = weight_decay + + norm_module_types = ( + torch.nn.BatchNorm1d, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.SyncBatchNorm, + torch.nn.GroupNorm, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, + torch.nn.LayerNorm, + torch.nn.LocalResponseNorm, + ) + params: List[Dict[str, Any]] = [] + for ( + _module_name, + module, + module_param_name, + value, + ) in iterate_module_named_parameters(model): + cur_wd = weight_decay + if isinstance(module, norm_module_types): + cur_wd = weight_decay_norm + elif isinstance(module, torch.nn.Embedding): + cur_wd = weight_decay_embed + elif module_param_name == "bias": + cur_wd = weight_decay_bias + if cur_wd is not None: + params += [ + { + "params": [value], + "weight_decay": cur_wd, + } + ] + + return params + + +def get_optimizer_param_groups_override( + model: OptimizerModelsType, + overrides: Optional[Dict[str, Dict[str, float]]] = None, +): + """ + Allow setting up overrides for parameter groups + overrides (dict: str -> (dict: str -> float)): + if not `None`, provides values for optimizer hyperparameters + (LR, weight decay) for module parameters with a given name; e.g. + {"embedding": {"lr": 0.01, "weight_decay": 0.1}} will set the LR and + weight decay values for all module parameters named `embedding` (default: None) + """ + + params: List[Dict[str, Any]] = [] + + if overrides is None: + return params + + for ( + _module_name, + _module, + module_param_name, + value, + ) in iterate_module_named_parameters(model): + schedule_params = {} + if module_param_name in overrides: + schedule_params.update(overrides[module_param_name]) + params += [{"params": [value], **schedule_params}] + + return params + + +def maybe_add_gradient_clipping(cfg, optim): # optim: the optimizer class + # detectron2 doesn't have full model gradient clipping now + clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE + enable = ( + cfg.SOLVER.CLIP_GRADIENTS.ENABLED + and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" + and clip_norm_val > 0.0 + ) + + class FullModelGradientClippingOptimizer(optim): + def step(self, closure=None): + all_params = itertools.chain(*[x["params"] for x in self.param_groups]) + torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val) + super().step(closure=closure) + + if enable: + return FullModelGradientClippingOptimizer + return d2_maybe_add_gradient_clipping(cfg, optim) + + +def _merge_dict(in_dict): + ret_dict = {} + assert all(isinstance(x, dict) for x in in_dict) + for dic in in_dict: + ret_dict.update(dic) + return ret_dict + + +@D2GO_OPTIM_MAPPER_REGISTRY.register() +def sgd(cfg, model: torch.nn.Module) -> torch.optim.Optimizer: + """ + Build an optimizer from config. 
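    Registered as 'sgd' in D2GO_OPTIM_MAPPER_REGISTRY; build_optimizer_mapper below
    selects it when cfg.SOLVER.OPTIMIZER.lower() == 'sgd'.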
+ """ + params = get_optimizer_param_groups(model, cfg) + return maybe_add_gradient_clipping(cfg, torch.optim.SGD)( + params, + cfg.SOLVER.BASE_LR, + momentum=cfg.SOLVER.MOMENTUM, + nesterov=cfg.SOLVER.NESTEROV, + ) + + +@D2GO_OPTIM_MAPPER_REGISTRY.register() +def adamw(cfg, model: torch.nn.Module) -> torch.optim.Optimizer: + """ + Build an optimizer from config. + """ + params = get_optimizer_param_groups(model, cfg) + return maybe_add_gradient_clipping(cfg, torch.optim.AdamW)( + params, cfg.SOLVER.BASE_LR + ) + + +@D2GO_OPTIM_MAPPER_REGISTRY.register() +def sgd_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer: + """ + Build a multi_tensor SGD optimizer that works significantly faster. + This version is expected to be the default implementation for SGD + optimizer by end of H1'21. To benefit from the speedup, the number + of parameter groups needs to be reduced using `reduce_param_groups`. + """ + params = get_optimizer_param_groups(model, cfg) + return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.SGD)( + params, + cfg.SOLVER.BASE_LR, + momentum=cfg.SOLVER.MOMENTUM, + nesterov=cfg.SOLVER.NESTEROV, + ) + + +@D2GO_OPTIM_MAPPER_REGISTRY.register() +def adamw_mt(cfg, model: torch.nn.Module) -> torch.optim.Optimizer: + """ + Build a multi_tensor adamw optimizer that works significantly faster. + This version is expected to be the default implementation for adamw + optimizer by end of H1'21. To benefit from the speedup, the number + of parameter groups needs to be reduced using `reduce_param_groups`. + """ + params = get_optimizer_param_groups(model, cfg) + return maybe_add_gradient_clipping(cfg, torch.optim._multi_tensor.AdamW)( + params, cfg.SOLVER.BASE_LR + ) + + +def build_optimizer_mapper(cfg, model): + name = cfg.SOLVER.OPTIMIZER + optimizer = D2GO_OPTIM_MAPPER_REGISTRY.get(name.lower())(cfg, model) + + def _param_group_str(group): + ret = {x: y if x != "params" else len(y) for x, y in group.items()} + ret = sorted(ret.items()) + ret = [f"{x[0]}: {x[1]}" for x in ret] + ret = "{" + ", ".join(ret) + "}" + return ret + + def _param_groups_str(groups): + ret = "" + for idx, group in enumerate(groups): + ret += f"Param group {idx}: {_param_group_str(group)}\n" + return ret + + logger.info( + f"optimizer parameter groups:\n{_param_groups_str(optimizer.param_groups)}" + ) + + return optimizer diff --git a/yolov7/utils/__init__.py b/yolov7/utils/__init__.py old mode 100755 new mode 100644 diff --git a/yolov7/utils/allreduce_norm.py b/yolov7/utils/allreduce_norm.py old mode 100755 new mode 100644 diff --git a/yolov7/utils/boxes.py b/yolov7/utils/boxes.py old mode 100755 new mode 100644 index 4aed110..6d9d3f2 --- a/yolov7/utils/boxes.py +++ b/yolov7/utils/boxes.py @@ -23,19 +23,20 @@ """ used in DETR """ + + def box_cxcywh_to_xyxy(x): x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), - (x_c + 0.5 * w), (y_c + 0.5 * h)] + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return torch.stack(b, dim=-1) def box_xyxy_to_cxcywh(x): x0, y0, x1, y1 = x.unbind(-1) - b = [(x0 + x1) / 2, (y0 + y1) / 2, - (x1 - x0), (y1 - y0)] + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] return torch.stack(b, dim=-1) + def convert_coco_poly_to_mask(segmentations, height, width): masks = [] for polygons in segmentations: @@ -53,7 +54,6 @@ def convert_coco_poly_to_mask(segmentations, height, width): return masks - def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: raise IndexError @@ -64,10 +64,14 @@ 
def bboxes_iou(bboxes_a, bboxes_b, xyxy=True): area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) else: - tl = torch.max((bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), - (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2)) - br = torch.min((bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), - (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2)) + tl = torch.max( + (bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2), + ) + br = torch.min( + (bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2), + ) area_a = torch.prod(bboxes_a[:, 2:], 1) area_b = torch.prod(bboxes_b[:, 2:], 1) @@ -117,7 +121,7 @@ def generalized_box_iou(boxes1, boxes2): return iou - (area - union) / area - + class IOUloss(nn.Module): def __init__(self, reduction="none", loss_type="iou"): super(IOUloss, self).__init__() @@ -144,15 +148,13 @@ def forward(self, pred, target): iou = (area_i) / (area_p + area_g - area_i + 1e-16) if self.loss_type == "iou": - loss = 1 - iou ** 2 + loss = 1 - iou**2 elif self.loss_type == "giou": c_tl = torch.min( - (pred[:, :2] - pred[:, 2:] / - 2), (target[:, :2] - target[:, 2:] / 2) + (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2) ) c_br = torch.max( - (pred[:, :2] + pred[:, 2:] / - 2), (target[:, :2] + target[:, 2:] / 2) + (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2) ) area_c = torch.prod(c_br - c_tl, 1) giou = iou - (area_c - area_i) / area_c.clamp(1e-16) @@ -182,15 +184,59 @@ def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): continue # Get score and class with highest confidence class_conf, class_pred = torch.max( - image_pred[:, 5: 5 + num_classes], 1, keepdim=True) + image_pred[:, 5 : 5 + num_classes], 1, keepdim=True + ) + + conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() + + # _, conf_mask = torch.topk((image_pred[:, 4] * class_conf.squeeze()), 1000) + # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) + detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) + detections = detections[conf_mask] + if not detections.size(0): + continue + + nms_out_index = torchvision.ops.batched_nms( + detections[:, :4], + detections[:, 4] * detections[:, 5], + detections[:, 6], + nms_thre, + ) + detections = detections[nms_out_index] + if output[i] is None: + output[i] = detections + else: + output[i] = torch.cat((output[i], detections)) + return output + + +def postprocess_yolox_kpts(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): + box_corner = prediction.new(prediction.shape) + box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 + box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 + box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 + box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 + prediction[:, :, :4] = box_corner[:, :, :4] + + output = [None for _ in range(len(prediction))] + for i, image_pred in enumerate(prediction): + + # If none are remaining => process next image + if not image_pred.size(0): + continue + # Get score and class with highest confidence + class_conf, class_pred = torch.max( + image_pred[:, 5 : 5 + num_classes], 1, keepdim=True + ) - conf_mask = (image_pred[:, 4] * - class_conf.squeeze() >= conf_thre).squeeze() + conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() # _, conf_mask = torch.topk((image_pred[:, 4] 
* class_conf.squeeze()), 1000) # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) + # YOLOX-Keypoints: 4 + 1 + 1 + 3*17 detections = torch.cat( - (image_pred[:, :5], class_conf, class_pred.float()), 1) + (image_pred[:, :5], class_conf, class_pred.float(), image_pred[6:]), 1 + ) detections = detections[conf_mask] if not detections.size(0): continue @@ -232,14 +278,14 @@ def postprocessv5(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): continue # Get score and class with highest confidence class_conf, class_pred = torch.max( - image_pred[:, 5: 5 + num_classes], 1, keepdim=True) + image_pred[:, 5 : 5 + num_classes], 1, keepdim=True + ) conf_mask = (image_pred[:, 4] >= conf_thre).squeeze() # _, conf_mask = torch.topk((image_pred[:, 4] * class_conf.squeeze()), 1000) # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred) - detections = torch.cat( - (image_pred[:, :5], class_conf, class_pred.float()), 1) + detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1) detections = detections[conf_mask] if not detections.size(0): continue @@ -258,7 +304,17 @@ def postprocessv5(prediction, num_classes, conf_thre=0.7, nms_thre=0.45): return output -def postprocess_yolomask(prediction, preds_oriens, xys, whs, dets_anchor_idxes, num_classes, conf_thre=0.7, nms_thre=0.45, orien_thre = 0.3): +def postprocess_yolomask( + prediction, + preds_oriens, + xys, + whs, + dets_anchor_idxes, + num_classes, + conf_thre=0.7, + nms_thre=0.45, + orien_thre=0.3, +): box_corner = prediction.new(prediction.shape) box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 @@ -279,12 +335,11 @@ def postprocess_yolomask(prediction, preds_oriens, xys, whs, dets_anchor_idxes, continue # Get score and class with highest confidence class_conf, class_pred = torch.max( - image_pred[:, 5: 5 + num_classes], 1, keepdim=True) + image_pred[:, 5 : 5 + num_classes], 1, keepdim=True + ) - conf_mask = (image_pred[:, 4] * - class_conf.squeeze() >= conf_thre).squeeze() - detections = torch.cat( - (image_pred[:, :4], class_conf, class_pred.float()), 1) + conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze() + detections = torch.cat((image_pred[:, :4], class_conf, class_pred.float()), 1) detections = detections[conf_mask] xy = xy[conf_mask] @@ -306,10 +361,13 @@ def postprocess_yolomask(prediction, preds_oriens, xys, whs, dets_anchor_idxes, wh = wh[nms_out_index] anchor_idx = dets_anchor_idx[nms_out_index] - masks = ((torch.abs(pred_orien[anchor_idx, 0] - xy[:, 0].view(-1, 1, 1)) < - orien_thre * wh[:, 0].view(-1, 1, 1)) & - (torch.abs(pred_orien[anchor_idx, 1] - xy[:, 1].view(-1, 1, 1)) < - orien_thre * wh[:, 1].view(-1, 1, 1))) + masks = ( + torch.abs(pred_orien[anchor_idx, 0] - xy[:, 0].view(-1, 1, 1)) + < orien_thre * wh[:, 0].view(-1, 1, 1) + ) & ( + torch.abs(pred_orien[anchor_idx, 1] - xy[:, 1].view(-1, 1, 1)) + < orien_thre * wh[:, 1].view(-1, 1, 1) + ) if output[i] is None: output[i] = detections @@ -325,6 +383,7 @@ def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max): bbox[:, 1::2] = np.clip(bbox[:, 1::2] * scale_ratio + padh, 0, h_max) return bbox + # for OrienMask def bbox_ious2(bbox1, bbox2): """IoUs of bounding boxes with x, y, width and height @@ -341,10 +400,14 @@ def bbox_ious2(bbox1, bbox2): b2x1, b2y1 = (bbox2[..., 0:2] - bbox2[..., 2:4] / 2).split(1, -1) b2x2, b2y2 = (bbox2[..., 0:2] + bbox2[..., 2:4] / 2).split(1, -1) - dx = 
(b1x2.min(b2x2.squeeze(-1).unsqueeze(-2)) - - b1x1.max(b2x1.squeeze(-1).unsqueeze(-2))).clamp(min=0) - dy = (b1y2.min(b2y2.squeeze(-1).unsqueeze(-2)) - - b1y1.max(b2y1.squeeze(-1).unsqueeze(-2))).clamp(min=0) + dx = ( + b1x2.min(b2x2.squeeze(-1).unsqueeze(-2)) + - b1x1.max(b2x1.squeeze(-1).unsqueeze(-2)) + ).clamp(min=0) + dy = ( + b1y2.min(b2y2.squeeze(-1).unsqueeze(-2)) + - b1y1.max(b2y1.squeeze(-1).unsqueeze(-2)) + ).clamp(min=0) inter = dx * dy area1 = (b1x2 - b1x1) * (b1y2 - b1y1) @@ -408,7 +471,9 @@ class BoxModeMy(IntEnum): """ @staticmethod - def convert(box: _RawBoxType, from_mode: "BoxModeMy", to_mode: "BoxModeMy") -> _RawBoxType: + def convert( + box: _RawBoxType, from_mode: "BoxModeMy", to_mode: "BoxModeMy" + ) -> _RawBoxType: """ Args: box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5 @@ -436,7 +501,10 @@ def convert(box: _RawBoxType, from_mode: "BoxModeMy", to_mode: "BoxModeMy") -> _ else: arr = box.clone() - assert to_mode not in [BoxModeMy.XYXY_REL, BoxModeMy.XYWH_REL] and from_mode not in [ + assert to_mode not in [ + BoxModeMy.XYXY_REL, + BoxModeMy.XYWH_REL, + ] and from_mode not in [ BoxModeMy.XYXY_REL, BoxModeMy.XYWH_REL, ], "Relative mode not yet supported!" @@ -496,7 +564,13 @@ def convert(box: _RawBoxType, from_mode: "BoxModeMy", to_mode: "BoxModeMy") -> _ return arr @staticmethod - def convert_and_normalize(box: _RawBoxType, from_mode: "BoxModeMy", to_mode: "BoxModeMy", ori_w: int, ori_h: int) -> _RawBoxType: + def convert_and_normalize( + box: _RawBoxType, + from_mode: "BoxModeMy", + to_mode: "BoxModeMy", + ori_w: int, + ori_h: int, + ) -> _RawBoxType: """ Args: box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5 @@ -524,7 +598,10 @@ def convert_and_normalize(box: _RawBoxType, from_mode: "BoxModeMy", to_mode: "Bo else: arr = box.clone() - assert to_mode not in [BoxModeMy.XYXY_REL, BoxModeMy.XYWH_REL] and from_mode not in [ + assert to_mode not in [ + BoxModeMy.XYXY_REL, + BoxModeMy.XYWH_REL, + ] and from_mode not in [ BoxModeMy.XYXY_REL, BoxModeMy.XYWH_REL, ], "Relative mode not yet supported!" @@ -584,3 +661,119 @@ def convert_and_normalize(box: _RawBoxType, from_mode: "BoxModeMy", to_mode: "Bo return arr.numpy() else: return arr + + +class IOUlossV6: + """Calculate IoU loss.""" + + def __init__(self, box_format="xywh", iou_type="ciou", reduction="none", eps=1e-7): + """Setting of the class. + Args: + box_format: (string), must be one of 'xywh' or 'xyxy'. + iou_type: (string), can be one of 'ciou', 'diou', 'giou' or 'siou' + reduction: (string), specifies the reduction to apply to the output, must be one of 'none', 'mean','sum'. + eps: (float), a value to avoid devide by zero error. + """ + self.box_format = box_format + self.iou_type = iou_type.lower() + self.reduction = reduction + self.eps = eps + + def __call__(self, box1, box2): + """calculate iou. 
box1 and box2 are torch tensor with shape [M, 4] and [Nm 4].""" + box2 = box2.T + if self.box_format == "xyxy": + b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3] + elif self.box_format == "xywh": + b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2 + b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2 + b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2 + b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2 + + # Intersection area + inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * ( + torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1) + ).clamp(0) + + # Union Area + w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + self.eps + w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + self.eps + union = w1 * h1 + w2 * h2 - inter + self.eps + iou = inter / union + + cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1) # convex width + ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1) # convex height + if self.iou_type == "giou": + c_area = cw * ch + self.eps # convex area + iou = iou - (c_area - union) / c_area + elif self.iou_type in ["diou", "ciou"]: + c2 = cw**2 + ch**2 + self.eps # convex diagonal squared + rho2 = ( + (b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 + + (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2 + ) / 4 # center distance squared + if self.iou_type == "diou": + iou = iou - rho2 / c2 + elif self.iou_type == "ciou": + v = (4 / math.pi**2) * torch.pow( + torch.atan(w2 / h2) - torch.atan(w1 / h1), 2 + ) + with torch.no_grad(): + alpha = v / (v - iou + (1 + self.eps)) + iou = iou - (rho2 / c2 + v * alpha) + elif self.iou_type == "siou": + # SIoU Loss https://arxiv.org/pdf/2205.12740.pdf + s_cw = (b2_x1 + b2_x2 - b1_x1 - b1_x2) * 0.5 + s_ch = (b2_y1 + b2_y2 - b1_y1 - b1_y2) * 0.5 + sigma = torch.pow(s_cw**2 + s_ch**2, 0.5) + sin_alpha_1 = torch.abs(s_cw) / sigma + sin_alpha_2 = torch.abs(s_ch) / sigma + threshold = pow(2, 0.5) / 2 + sin_alpha = torch.where(sin_alpha_1 > threshold, sin_alpha_2, sin_alpha_1) + angle_cost = torch.cos(torch.arcsin(sin_alpha) * 2 - math.pi / 2) + rho_x = (s_cw / cw) ** 2 + rho_y = (s_ch / ch) ** 2 + gamma = angle_cost - 2 + distance_cost = 2 - torch.exp(gamma * rho_x) - torch.exp(gamma * rho_y) + omiga_w = torch.abs(w1 - w2) / torch.max(w1, w2) + omiga_h = torch.abs(h1 - h2) / torch.max(h1, h2) + shape_cost = torch.pow(1 - torch.exp(-1 * omiga_w), 4) + torch.pow( + 1 - torch.exp(-1 * omiga_h), 4 + ) + iou = iou - 0.5 * (distance_cost + shape_cost) + loss = 1.0 - iou + + if self.reduction == "sum": + loss = loss.sum() + elif self.reduction == "mean": + loss = loss.mean() + + return loss + + +def pairwise_bbox_iou(box1, box2, box_format="xywh"): + """Calculate iou. 
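    Returns an [M, N] matrix of pairwise IoUs between box1 (M boxes) and box2 (N boxes).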
+ This code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/utils/boxes.py + """ + if box_format == "xyxy": + lt = torch.max(box1[:, None, :2], box2[:, :2]) + rb = torch.min(box1[:, None, 2:], box2[:, 2:]) + area_1 = torch.prod(box1[:, 2:] - box1[:, :2], 1) + area_2 = torch.prod(box2[:, 2:] - box2[:, :2], 1) + + elif box_format == "xywh": + lt = torch.max( + (box1[:, None, :2] - box1[:, None, 2:] / 2), + (box2[:, :2] - box2[:, 2:] / 2), + ) + rb = torch.min( + (box1[:, None, :2] + box1[:, None, 2:] / 2), + (box2[:, :2] + box2[:, 2:] / 2), + ) + + area_1 = torch.prod(box1[:, 2:], 1) + area_2 = torch.prod(box2[:, 2:], 1) + valid = (lt < rb).type(lt.type()).prod(dim=2) + inter = torch.prod(rb - lt, 2) * valid + return inter / (area_1[:, None] + area_2 - inter) diff --git a/yolov7/utils/checkpoint.py b/yolov7/utils/checkpoint.py new file mode 100644 index 0000000..e7852bf --- /dev/null +++ b/yolov7/utils/checkpoint.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +import os +import shutil +import torch +import os.path as osp +from alfred import logger as LOGGER +from torch import nn + + +def fuse_conv_and_bn(conv, bn): + # Fuse convolution and batchnorm layers https://tehnokv.com/posts/fusing-batchnorm-and-conv/ + fusedconv = ( + nn.Conv2d( + conv.in_channels, + conv.out_channels, + kernel_size=conv.kernel_size, + stride=conv.stride, + padding=conv.padding, + groups=conv.groups, + bias=True, + ) + .requires_grad_(False) + .to(conv.weight.device) + ) + + # prepare filters + w_conv = conv.weight.clone().view(conv.out_channels, -1) + w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var))) + fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape)) + + # prepare spatial bias + b_conv = ( + torch.zeros(conv.weight.size(0), device=conv.weight.device) + if conv.bias is None + else conv.bias + ) + b_bn = bn.bias - bn.weight.mul(bn.running_mean).div( + torch.sqrt(bn.running_var + bn.eps) + ) + fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn) + + return fusedconv + + +def fuse_model(model): + # from yolov6.layers.common import Conv + # for m in model.modules(): + # if type(m) is Conv and hasattr(m, "bn"): + # m.conv = fuse_conv_and_bn(m.conv, m.bn) # update conv + # delattr(m, "bn") # remove batchnorm + # m.forward = m.forward_fuse # update forward + return model + + +def load_state_dict(weights, model, map_location=None): + """Load weights from checkpoint file, only assign weights those layers' name and shape are match.""" + ckpt = torch.load(weights, map_location=map_location) + state_dict = ckpt["model"].float().state_dict() + model_state_dict = model.state_dict() + state_dict = { + k: v + for k, v in state_dict.items() + if k in model_state_dict and v.shape == model_state_dict[k].shape + } + model.load_state_dict(state_dict, strict=False) + del ckpt, state_dict, model_state_dict + return model + + +def load_checkpoint(weights, map_location=None, inplace=True, fuse=True): + """Load model from checkpoint file.""" + LOGGER.info("Loading checkpoint from {}".format(weights)) + ckpt = torch.load(weights, map_location=map_location) # load + model = ckpt["ema" if ckpt.get("ema") else "model"].float() + if fuse: + LOGGER.info("\nFusing model...") + model = fuse_model(model).eval() + else: + model = model.eval() + return model + + +def save_checkpoint(ckpt, is_best, save_dir, model_name=""): + """Save checkpoint to the disk.""" + if not osp.exists(save_dir): + os.makedirs(save_dir) + filename = 
osp.join(save_dir, model_name + ".pt") + torch.save(ckpt, filename) + if is_best: + best_filename = osp.join(save_dir, "best_ckpt.pt") + shutil.copyfile(filename, best_filename) + + +def strip_optimizer(ckpt_dir): + for s in ["best", "last"]: + ckpt_path = osp.join(ckpt_dir, "{}_ckpt.pt".format(s)) + if not osp.exists(ckpt_path): + continue + ckpt = torch.load(ckpt_path, map_location=torch.device("cpu")) + if ckpt.get("ema"): + ckpt["model"] = ckpt["ema"] # replace model with ema + for k in ["optimizer", "ema", "updates"]: # keys + ckpt[k] = None + ckpt["epoch"] = -1 + ckpt["model"].half() # to FP16 + for p in ckpt["model"].parameters(): + p.requires_grad = False + torch.save(ckpt, ckpt_path) diff --git a/yolov7/utils/d2overrides.py b/yolov7/utils/d2overrides.py new file mode 100644 index 0000000..fde52d6 --- /dev/null +++ b/yolov7/utils/d2overrides.py @@ -0,0 +1,105 @@ +""" +function I want to override in detectron2 +""" +from detectron2.utils import comm +from detectron2.utils.collect_env import collect_env_info +from detectron2.utils.env import seed_all_rng +from detectron2.utils.file_io import PathManager +from detectron2.utils.logger import setup_logger +from omegaconf import OmegaConf +from torch.nn.parallel import DistributedDataParallel +import torch +import detectron2.data.transforms as T +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import CfgNode, LazyConfig +import os + + +def _try_get_key(cfg, *keys, default=None): + """ + Try select keys from cfg until the first key that exists. Otherwise return default. + """ + if isinstance(cfg, CfgNode): + cfg = OmegaConf.create(cfg.dump()) + for k in keys: + none = object() + p = OmegaConf.select(cfg, k, default=none) + if p is not none: + return p + return default + + +def _highlight(code, filename): + try: + import pygments + except ImportError: + return code + + from pygments.lexers import Python3Lexer, YamlLexer + from pygments.formatters import Terminal256Formatter + + lexer = Python3Lexer() if filename.endswith(".py") else YamlLexer() + code = pygments.highlight(code, lexer, Terminal256Formatter(style="monokai")) + return code + + +def default_setup(cfg, args): + """ + Perform some basic common setups at the beginning of a job, including: + + 1. Set up the detectron2 logger + 2. Log basic information about environment, cmdline arguments, and config + 3. Backup the config to the output directory + + Args: + cfg (CfgNode or omegaconf.DictConfig): the full config to be used + args (argparse.NameSpace): the command line arguments to be logged + """ + output_dir = _try_get_key(cfg, "OUTPUT_DIR", "output_dir", "train.output_dir") + if comm.is_main_process() and output_dir: + PathManager.mkdirs(output_dir) + + rank = comm.get_rank() + setup_logger(output_dir, distributed_rank=rank, name="fvcore") + logger = setup_logger(output_dir, distributed_rank=rank) + + logger.info( + "Rank of current process: {}. 
World size: {}".format( + rank, comm.get_world_size() + ) + ) + logger.info("Environment info:\n" + collect_env_info()) + + logger.info("Command line arguments: " + str(args)) + if hasattr(args, "config_file") and args.config_file != "": + logger.info( + "Contents of args.config_file={}:\n{}".format( + args.config_file, + _highlight( + PathManager.open(args.config_file, "r").read(), args.config_file + ), + ) + ) + + if comm.is_main_process() and output_dir: + # Note: some of our scripts may expect the existence of + # config.yaml in output directory + path = os.path.join(output_dir, "config.yaml") + if isinstance(cfg, CfgNode): + logger.info(f"Running with full config was omitted.") + with PathManager.open(path, "w") as f: + f.write(cfg.dump()) + else: + LazyConfig.save(cfg, path) + logger.info("Full config saved to {}".format(path)) + + # make sure each worker has a different, yet deterministic seed if specified + seed = _try_get_key(cfg, "SEED", "train.seed", default=-1) + seed_all_rng(None if seed < 0 else seed + rank) + + # cudnn benchmark has large overhead. It shouldn't be used considering the small size of + # typical validation set. + if not (hasattr(args, "eval_only") and args.eval_only): + torch.backends.cudnn.benchmark = _try_get_key( + cfg, "CUDNN_BENCHMARK", "train.cudnn_benchmark", default=False + ) diff --git a/yolov7/utils/detr_utils.py b/yolov7/utils/detr_utils.py old mode 100755 new mode 100644 index 570990d..f732ed7 --- a/yolov7/utils/detr_utils.py +++ b/yolov7/utils/detr_utils.py @@ -24,6 +24,9 @@ def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float cost_class: This is the relative weight of the classification error in the matching cost cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost + + softmax suitable for DETR + sigmoid suitable for AnchorDETR """ super().__init__() self.cost_class = cost_class @@ -55,8 +58,10 @@ def forward(self, outputs, targets): bs, num_queries = outputs["pred_logits"].shape[:2] # We flatten to compute the cost matrices in a batch - out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) # [batch_size * num_queries, num_classes] - out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + # [batch_size * num_queries, num_classes] + out_prob = outputs["pred_logits"].flatten(0, 1).softmax(-1) + out_bbox = outputs["pred_boxes"].flatten( + 0, 1) # [batch_size * num_queries, 4] # Also concat the target labels and boxes tgt_ids = torch.cat([v["labels"] for v in targets]) @@ -71,16 +76,317 @@ def forward(self, outputs, targets): cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) # Compute the giou cost betwen boxes - cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + cost_giou = - \ + generalized_box_iou(box_cxcywh_to_xyxy( + out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) # Final cost matrix - C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou + C = self.cost_bbox * cost_bbox + self.cost_class * \ + cost_class + self.cost_giou * cost_giou C = C.view(bs, num_queries, -1).cpu() sizes = [len(v["boxes"]) for v in targets] - indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] + indices = [linear_sum_assignment(c[i]) + for i, c in enumerate(C.split(sizes, -1))] return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) 
for i, j in indices] +class HungarianMatcherAnchorDETR(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). + """ + + def __init__(self, + cost_class: float = 1, + cost_bbox: float = 1, + cost_giou: float = 1): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost + cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost + """ + super().__init__() + self.cost_class = cost_class + self.cost_bbox = cost_bbox + self.cost_giou = cost_giou + assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" + + def forward(self, outputs, targets): + """ Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + with torch.no_grad(): + bs, num_queries = outputs["pred_logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() + out_bbox = outputs["pred_boxes"].flatten( + 0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. 
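+            # AnchorDETR scores classes with a sigmoid rather than a softmax, so the
+            # matching cost below follows the focal-loss form (alpha, gamma):
+            # pos_cost_class grows when the predicted probability of the target class
+            # is low, neg_cost_class grows when it is high, and their difference,
+            # taken at the target-class columns, favours queries that confidently
+            # predict the right class.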
+ alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob ** gamma) * \ + (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * \ + ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + cost_class = pos_cost_class[:, tgt_ids] - \ + neg_cost_class[:, tgt_ids] + + # Compute the L1 cost between boxes + cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost betwen boxes + cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), + box_cxcywh_to_xyxy(tgt_bbox)) + + # Final cost matrix + C = self.cost_bbox * cost_bbox + self.cost_class * \ + cost_class + self.cost_giou * cost_giou + C = C.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment( + c[i]) for i, c in enumerate(C.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +class HungarianMatcherSMCA(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). + """ + + def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost + cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost + """ + super().__init__() + self.cost_class = cost_class + self.cost_bbox = cost_bbox + self.cost_giou = cost_giou + assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" + + @torch.no_grad() + def forward(self, outputs, targets): + """ Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["pred_logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + # [batch_size * num_queries, num_classes] + out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() + out_bbox = outputs["pred_boxes"].flatten( + 0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["labels"] for v in targets]) + tgt_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. 
Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. + + alpha = 0.25 + gamma = 2 + neg_cost_class = (1 - alpha) * (out_prob ** gamma) * \ + (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * \ + ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] + + # Compute the L1 cost between boxes + cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) + + # Compute the giou cost betwen boxes + cost_giou = - \ + generalized_box_iou(box_cxcywh_to_xyxy( + out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + + # Final cost matrix + C = self.cost_bbox * cost_bbox + self.cost_class * \ + cost_class + self.cost_giou * cost_giou + C = C.view(bs, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) + for i, c in enumerate(C.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +class HungarianMatcherD2go(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). + """ + + def __init__( + self, + cost_class: float = 1, + cost_bbox: float = 1, + cost_giou: float = 1, + use_focal_loss=False, + ): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost + cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost + """ + super().__init__() + self.cost_class = cost_class + self.cost_bbox = cost_bbox + self.cost_giou = cost_giou + assert ( + cost_class != 0 or cost_bbox != 0 or cost_giou != 0 + ), "all costs cant be 0" + self.use_focal_loss = use_focal_loss + + @torch.no_grad() + def forward(self, outputs, targets): + """Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + bs, num_queries = outputs["pred_logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + if self.use_focal_loss: + out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() + else: + out_prob = ( + outputs["pred_logits"].flatten(0, 1).softmax(-1) + ) # [batch_size * 
num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten( + 0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + tgt_ids = torch.cat([v["labels"] + for v in targets]) # [\sum_b NUM-BOX_b,] + tgt_bbox = torch.cat([v["boxes"] + for v in targets]) # [\sum_b NUM-BOX_b, 4] + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be omitted. + if self.use_focal_loss: + alpha = 0.25 + gamma = 2.0 + neg_cost_class = ( + (1 - alpha) * (out_prob ** gamma) * + (-(1 - out_prob + 1e-8).log()) + ) + pos_cost_class = ( + alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + ) + cost_class = pos_cost_class[:, tgt_ids] - \ + neg_cost_class[:, tgt_ids] + else: + cost_class = -out_prob[ + :, tgt_ids + ] # shape [batch_size * num_queries, \sum_b NUM-BOX_b] + + # Compute the L1 cost between boxes + cost_bbox = torch.cdist( + out_bbox, tgt_bbox, p=1 + ) # shape [batch_size * num_queries,\sum_b NUM-BOX_b] + + # Compute the giou cost betwen boxes + # shape [batch_size * num_queries, \sum_b NUM-BOX_b] + cost_giou = -generalized_box_iou( + box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox) + ) + + # Final cost matrix + C = ( + self.cost_bbox * cost_bbox + + self.cost_class * cost_class + + self.cost_giou * cost_giou + ) + C = C.view( + bs, num_queries, -1 + ).cpu() # shape [batch_size, num_queries, \sum_b NUM-BOX_b] + + sizes = [len(v["boxes"]) for v in targets] # shape [batch_size,] + # each split c shape [batch_size, num_queries, NUM-BOX_b] + indices = [ + linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1)) + ] + # A list where each item is [row_indices, col_indices] + return [ + ( + torch.as_tensor(i, dtype=torch.int64), + torch.as_tensor(j, dtype=torch.int64), + ) + for i, j in indices + ] + + def build_matcher(args): return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou) diff --git a/yolov7/utils/get_default_cfg.py b/yolov7/utils/get_default_cfg.py new file mode 100644 index 0000000..bb1cf05 --- /dev/null +++ b/yolov7/utils/get_default_cfg.py @@ -0,0 +1,18 @@ + + +def get_default_solver_configs(_C): + # Set default optimizer + _C.SOLVER.OPTIMIZER = "sgd" + _C.SOLVER.LR_MULTIPLIER_OVERWRITE = [] + _C.SOLVER.WEIGHT_DECAY_EMBED = 0.0 + + # Default world size in D2 is 0, which means scaling is not applied. For D2Go + # auto scale is encouraged, setting it to 8 + assert _C.SOLVER.REFERENCE_WORLD_SIZE == 0 + _C.SOLVER.REFERENCE_WORLD_SIZE = 8 + # Besides scaling default D2 configs, also scale quantization configs + _C.SOLVER.AUTO_SCALING_METHODS = [ + "default_scale_d2_configs", + "default_scale_quantization_configs", + ] + return _C \ No newline at end of file diff --git a/yolov7/utils/misc.py b/yolov7/utils/misc.py old mode 100755 new mode 100644 index 5883fce..022a6d9 --- a/yolov7/utils/misc.py +++ b/yolov7/utils/misc.py @@ -12,6 +12,14 @@ # needed due to empty tensor bug in pytorch and torchvision 0.5 import torchvision +import math + + +def make_divisible(x, divisor): + # Upward revision the value x to make it evenly divisible by the divisor. 
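+    # i.e. round x up to the nearest multiple of divisor,
+    # e.g. make_divisible(30, 8) -> 32.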
+ return math.ceil(x / divisor) * divisor + + if float(torchvision.__version__.split(".")[1]) < 7.0: from torchvision.ops import _new_empty_tensor from torchvision.ops.misc import _output_size @@ -25,6 +33,22 @@ def is_dist_avail_and_initialized(): return True +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors @@ -47,7 +71,7 @@ def decompose(self): def __repr__(self): return str(self.tensors) - + def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] @@ -78,9 +102,9 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): mask = torch.ones((b, h, w), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], :img.shape[2]] = False + m[: img.shape[1], : img.shape[2]] = False else: - raise ValueError('not supported') + raise ValueError("not supported") return NestedTensor(tensor, mask) @@ -90,7 +114,9 @@ def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: max_size = [] for i in range(tensor_list[0].dim()): - max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64) + max_size_i = torch.max( + torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) + ).to(torch.int64) max_size.append(max_size_i) max_size = tuple(max_size) @@ -102,11 +128,15 @@ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTen padded_masks = [] for img in tensor_list: padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] - padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) + padded_img = torch.nn.functional.pad( + img, (0, padding[2], 0, padding[1], 0, padding[0]) + ) padded_imgs.append(padded_img) m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) - padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) + padded_mask = torch.nn.functional.pad( + m, (0, padding[2], 0, padding[1]), "constant", 1 + ) padded_masks.append(padded_mask.to(torch.bool)) tensor = torch.stack(padded_imgs) @@ -114,8 +144,36 @@ def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTen return NestedTensor(tensor, mask=mask) + +def nested_masks_from_list(tensor_list: List[Tensor], input_shape=None): + if tensor_list[0].ndim == 3: + dim_size = sum([img.shape[0] for img in tensor_list]) + if input_shape is None: + max_size = _max_by_axis([list(img.shape[-2:]) for img in tensor_list]) + else: + max_size = [input_shape[0], input_shape[1]] + batch_shape = [dim_size] + max_size + # b, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.zeros(batch_shape, dtype=torch.bool, device=device) + idx = 0 + for img in tensor_list: + c = img.shape[0] + c_ = idx + c + tensor[idx:c_, : img.shape[1], : img.shape[2]].copy_(img) + mask[idx:c_, : img.shape[1], : img.shape[2]] = True + idx = c_ + else: + raise ValueError("not supported") + return NestedTensor(tensor, mask) + + @torch.jit.unused -def 
_onnx_nested_tensor_from_tensor_list_no_padding(tensor_list: List[Tensor]) -> NestedTensor: +def _onnx_nested_tensor_from_tensor_list_no_padding( + tensor_list: List[Tensor], +) -> NestedTensor: """ assume input tensor_list all tensor shape are same. """ @@ -126,7 +184,9 @@ def _onnx_nested_tensor_from_tensor_list_no_padding(tensor_list: List[Tensor]) - return NestedTensor(imgs, masks) -def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): +def interpolate( + input, size=None, scale_factor=None, mode="nearest", align_corners=None +): # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor """ Equivalent to nn.functional.interpolate, but with support for empty batch sizes. @@ -143,7 +203,9 @@ class can go away. output_shape = list(input.shape[:-2]) + list(output_shape) return _new_empty_tensor(input, output_shape) else: - return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) + return torchvision.ops.misc.interpolate( + input, size, scale_factor, mode, align_corners + ) @torch.no_grad() @@ -162,4 +224,11 @@ def accuracy(output, target, topk=(1,)): for k in topk: correct_k = correct[:k].view(-1).float().sum(0) res.append(correct_k.mul_(100.0 / batch_size)) - return res \ No newline at end of file + return res + + +def inverse_sigmoid(x, eps: float = 1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) diff --git a/yolov7/utils/qat_utils.py b/yolov7/utils/qat_utils.py new file mode 100644 index 0000000..3fbb70d --- /dev/null +++ b/yolov7/utils/qat_utils.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +import logging +from functools import partial + +import torch +import torch.distributed as dist +try: + from torch.ao.quantization._learnable_fake_quantize import _LearnableFakeQuantize +except ImportError: + print('QAT disabled.') + + +logger = logging.getLogger(__name__) + + +def mixin_with_subclass(module, mix_class): + """Create a subclass of type(module) and mix_class while using all the data + from the `module` object + """ + ModuleType = type(module) + + class SubClass(mix_class, ModuleType): + def __init__(self, module): + assert isinstance(module, ModuleType) + # initialize the parent by copying the dict directly + self.__dict__ = module.__dict__.copy() + + ret = SubClass(module) + return ret + + +def _has_module(model, module_type): + for x in model.modules(): + if isinstance(x, module_type): + return True + return False + + +def check_for_learnable_fake_quant_ops(qat_method, model): + """Make sure learnable observers are used if qat method is `learnable`""" + if qat_method == "learnable": + if not _has_module(model, _LearnableFakeQuantize): + raise Exception( + "No learnable fake quant is used for learnable quantzation, please use d2go.utils.qat_utils.get_qat_qconfig() to get proper qconfig" + ) + + +def iterate_module_named_parameters(model, check_requires_grad=True): + """Iterate over all parameters for the model""" + memo = set() + for module_name, module in model.named_modules(): + for module_param_name, value in module.named_parameters(recurse=False): + if check_requires_grad and not value.requires_grad: + continue + # Avoid duplicating parameters + if value in memo: + continue + memo.add(value) + + yield module_name, module, module_param_name, value + + +def get_qat_qconfig(backend, qat_method="default"): + assert backend in ["qnnpack", "fbgemm"] + assert qat_method in ["default", "learnable"] + if qat_method == 
"default": + return torch.quantization.get_default_qat_qconfig(backend) + + ACT_CONFIGS = { + # follow `get_default_qat_qconfig()` + # fbcode/caffe2/torch/quantization/qconfig.py + "fbgemm": { + "reduce_range": True, + }, + "qnnpack": { + "reduce_range": False, + }, + } + + WEIGHT_CONFIGS = { + # follow `default_per_channel_weight_fake_quant` + # fbcode/caffe2/torch/quantization/fake_quantize.py + "fbgemm": { + "observer": torch.quantization.MovingAveragePerChannelMinMaxObserver, + "qscheme": torch.per_channel_symmetric, + "reduce_range": False, + "ch_axis": 0, + }, + # follow `default_weight_fake_quant` + # fbcode/caffe2/torch/quantization/fake_quantize.py + "qnnpack": { + "observer": torch.quantization.MovingAverageMinMaxObserver, + "qscheme": torch.per_tensor_symmetric, + "reduce_range": False, + }, + } + + act = _LearnableFakeQuantize.with_args( + observer=torch.quantization.MovingAverageMinMaxObserver, + quant_min=0, + quant_max=255, + use_grad_scaling=True, + **ACT_CONFIGS[backend], + ) + weight = _LearnableFakeQuantize.with_args( + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + use_grad_scaling=True, + **WEIGHT_CONFIGS[backend], + ) + return torch.quantization.QConfig(activation=act, weight=weight) + + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def sync_tensor(data): + world_size = get_world_size() + if world_size > 1: + dist.all_reduce(data, op=dist.ReduceOp.SUM) + data /= world_size + + +def toggle_lqat_fake_quant(mod, enable): + """Toggle fake quantization for learnable qat""" + if type(mod) == _LearnableFakeQuantize: + mod.toggle_fake_quant(enable) + + +# enable/disable fake quantization for learnable qat +enable_lqat_fake_quant = partial(toggle_lqat_fake_quant, enable=True) +disable_lqat_fake_quant = partial(toggle_lqat_fake_quant, enable=False) + + +def toggle_lqat_static_observer(mod, enable): + """Toggle static observers for learnable qat""" + if type(mod) == _LearnableFakeQuantize: + mod.toggle_observer_update(enable) + + +# enable/disable static observer for learnable qat +enable_lqat_static_observer = partial(toggle_lqat_static_observer, enable=True) +disable_lqat_static_observer = partial( + toggle_lqat_static_observer, enable=False) + + +def enable_lqat_learnable_observer(mod): + """Enable learning observers, will disable static observer updates""" + if type(mod) == _LearnableFakeQuantize: + sync_tensor(mod.scale.data) + sync_tensor(mod.zero_point.data) + mod.toggle_qparam_learning( + enabled=True).toggle_observer_update(enabled=False) + + +def disable_lqat_learnable_observer(mod): + """Disable learning observers""" + if type(mod) == _LearnableFakeQuantize: + mod.toggle_qparam_learning(enabled=False) + + +def get_optimizer_param_groups_learnable_qat(model, _): + """Set the weight decay for scale/zero_point for learnable_fake_quant to 0""" + params = [] + for ( + _module_name, + module, + module_param_name, + value, + ) in iterate_module_named_parameters(model, check_requires_grad=False): + if isinstance(module, _LearnableFakeQuantize): + if module_param_name in ("scale", "zero_point"): + params += [ + { + "params": [value], + "weight_decay": 0.0, + } + ] + + return params + + +def _is_observer_key(state_dict_key): + observer_keys = ["activation_post_process", "weight_fake_quant"] + return any(x in state_dict_key for x in observer_keys) + + +def _is_q_state_dict(state_dict): + return any(_is_observer_key(k) for k in state_dict) + + +class 
ModelGetOptimizerParamGroupLearnableQATMixin: + def get_optimizer_param_groups(self, opts): + ret = [] + if hasattr(super(), "get_optimizer_param_groups"): + ret = super().get_optimizer_param_groups(opts) + ret += get_optimizer_param_groups_learnable_qat(self, opts) + return ret + + +def setup_qat_get_optimizer_param_groups(model, qat_method): + """Add a function `get_optimizer_param_groups` to the model so that it could + return proper weight decay for learnable qat + """ + if qat_method != "learnable": + return model + + assert _is_q_state_dict(model.state_dict()) + + model = mixin_with_subclass( + model, ModelGetOptimizerParamGroupLearnableQATMixin) + assert hasattr(model, "get_optimizer_param_groups") + return model diff --git a/yolov7/utils/solov2_utils.py b/yolov7/utils/solov2_utils.py old mode 100755 new mode 100644 diff --git a/yolov7/version.py b/yolov7/version.py new file mode 100644 index 0000000..4e60e2b --- /dev/null +++ b/yolov7/version.py @@ -0,0 +1,19 @@ +# Copyright (c) Lucas Jin. All rights reserved. + +__version__ = "0.0.2" +short_version = __version__ + + +def parse_version_info(version_str): + version_info = [] + for x in version_str.split("."): + if x.isdigit(): + version_info.append(int(x)) + elif x.find("rc") != -1: + patch_version = x.split("rc") + version_info.append(int(patch_version[0])) + version_info.append(f"rc{patch_version[1]}") + return tuple(version_info) + + +version_info = parse_version_info(__version__)
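For reference, a minimal usage sketch of the sigmoid-based matcher added in yolov7/utils/detr_utils.py. This is not part of the diff; the tensor shapes, dummy data, and cost weights are illustrative assumptions, and boxes are expected in normalized (cx, cy, w, h) form:

import torch
from yolov7.utils.detr_utils import HungarianMatcherAnchorDETR

# illustrative cost weights; real values come from the training config
matcher = HungarianMatcherAnchorDETR(cost_class=2.0, cost_bbox=5.0, cost_giou=2.0)

batch_size, num_queries, num_classes = 2, 100, 80
outputs = {
    # raw logits; this matcher applies the sigmoid internally
    "pred_logits": torch.randn(batch_size, num_queries, num_classes),
    # normalized (cx, cy, w, h) boxes in [0, 1]
    "pred_boxes": torch.rand(batch_size, num_queries, 4),
}
targets = [
    {"labels": torch.tensor([3, 17]), "boxes": torch.rand(2, 4)},  # image with 2 GT boxes
    {"labels": torch.tensor([5]), "boxes": torch.rand(1, 4)},      # image with 1 GT box
]

indices = matcher(outputs, targets)
# indices[b] is a (pred_idx, tgt_idx) pair of equal-length int64 tensors,
# e.g. len(indices[0][0]) == 2 and len(indices[1][0]) == 1 here.

Similarly, a rough sketch of how the learnable-QAT helpers in yolov7/utils/qat_utils.py are meant to be wired together. The toy model and the surrounding flow (module fusion, QuantStub placement, optimizer construction) are assumptions kept deliberately minimal, not the repository's training pipeline:

import torch
from torch import nn
from yolov7.utils.qat_utils import (
    get_qat_qconfig,
    check_for_learnable_fake_quant_ops,
    setup_qat_get_optimizer_param_groups,
    enable_lqat_fake_quant,
    enable_lqat_learnable_observer,
)

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU()).train()
model.qconfig = get_qat_qconfig("fbgemm", qat_method="learnable")
torch.quantization.prepare_qat(model, inplace=True)

check_for_learnable_fake_quant_ops("learnable", model)       # sanity check
model = setup_qat_get_optimizer_param_groups(model, "learnable")
model.apply(enable_lqat_fake_quant)                          # turn fake quantization on
model.apply(enable_lqat_learnable_observer)                  # learn scale/zero-point instead of observing

# model.get_optimizer_param_groups(...) now also returns zero-weight-decay
# parameter groups for the learnable scale/zero_point tensors.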